2018-04-19 13:37:59 +00:00
|
|
|
/*-
|
2019-02-04 21:28:25 +00:00
|
|
|
* Copyright (c) 2016-2018 Netflix, Inc.
|
2018-04-19 13:37:59 +00:00
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
#include <sys/cdefs.h>
|
|
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
|
|
|
|
#include "opt_inet.h"
|
|
|
|
#include "opt_inet6.h"
|
2020-03-03 14:15:30 +00:00
|
|
|
#include "opt_rss.h"
|
2018-04-19 13:37:59 +00:00
|
|
|
#include "opt_tcpdebug.h"
|
2020-03-03 14:15:30 +00:00
|
|
|
|
2018-04-19 13:37:59 +00:00
|
|
|
/**
|
|
|
|
* Some notes about usage.
|
|
|
|
*
|
|
|
|
* The tcp_hpts system is designed to provide a high precision timer
|
2020-02-12 13:07:09 +00:00
|
|
|
* system for tcp. Its main purpose is to provide a mechanism for
|
2018-04-19 13:37:59 +00:00
|
|
|
* pacing packets out onto the wire. It can be used in two ways
|
|
|
|
* by a given TCP stack (and those two methods can be used simultaneously).
|
|
|
|
*
|
2019-07-10 20:40:39 +00:00
|
|
|
* First, and probably the main thing its used by Rack and BBR, it can
|
2018-04-19 13:37:59 +00:00
|
|
|
* be used to call tcp_output() of a transport stack at some time in the future.
|
|
|
|
* The normal way this is done is that tcp_output() of the stack schedules
|
|
|
|
* itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The
|
|
|
|
* slot is the time from now that the stack wants to be called but it
|
|
|
|
* must be converted to tcp_hpts's notion of slot. This is done with
|
|
|
|
* one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical
|
|
|
|
* call from the tcp_output() routine might look like:
|
|
|
|
*
|
|
|
|
* tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550));
|
|
|
|
*
|
|
|
|
 * The above would schedule tcp_output() to be called in 550 microseconds.
|
|
|
|
* Note that if using this mechanism the stack will want to add near
|
|
|
|
* its top a check to prevent unwanted calls (from user land or the
|
|
|
|
* arrival of incoming ack's). So it would add something like:
|
|
|
|
*
|
2021-12-02 22:45:04 +00:00
|
|
|
* if (tcp_in_hpts(inp))
|
2018-04-19 13:37:59 +00:00
|
|
|
* return;
|
|
|
|
*
|
|
|
|
 * to prevent output processing until the time allotted has gone by.
|
|
|
|
* Of course this is a bare bones example and the stack will probably
|
|
|
|
 * have more considerations than just the above.
|
2020-02-12 13:07:09 +00:00
|
|
|
*
|
2021-12-02 22:45:04 +00:00
|
|
|
* In order to run input queued segments from the HPTS context the
|
2020-02-12 13:07:09 +00:00
|
|
|
* tcp stack must define an input function for
|
2019-07-10 20:40:39 +00:00
|
|
|
* tfb_do_queued_segments(). This function understands
|
|
|
|
 * how to dequeue an array of packets that were input and
|
2020-02-12 13:07:09 +00:00
|
|
|
* knows how to call the correct processing routine.
|
2018-04-19 13:37:59 +00:00
|
|
|
*
|
2020-02-12 13:07:09 +00:00
|
|
|
* Locking in this is important as well so most likely the
|
2019-07-10 20:40:39 +00:00
|
|
|
* stack will need to define the tfb_do_segment_nounlock()
|
|
|
|
* splitting tfb_do_segment() into two parts. The main processing
|
|
|
|
* part that does not unlock the INP and returns a value of 1 or 0.
|
|
|
|
* It returns 0 if all is well and the lock was not released. It
|
|
|
|
* returns 1 if we had to destroy the TCB (a reset received etc).
|
|
|
|
* The remains of tfb_do_segment() then become just a simple call
|
|
|
|
* to the tfb_do_segment_nounlock() function and check the return
|
|
|
|
* code and possibly unlock.
|
2020-02-12 13:07:09 +00:00
|
|
|
*
|
2019-07-10 20:40:39 +00:00
|
|
|
* The stack must also set the flag on the INP that it supports this
|
|
|
|
 * feature i.e. INP_SUPPORTS_MBUFQ. The LRO code recognizes
|
|
|
|
* this flag as well and will queue packets when it is set.
|
|
|
|
* There are other flags as well INP_MBUF_QUEUE_READY and
|
|
|
|
* INP_DONT_SACK_QUEUE. The first flag tells the LRO code
|
|
|
|
* that we are in the pacer for output so there is no
|
|
|
|
* need to wake up the hpts system to get immediate
|
|
|
|
* input. The second tells the LRO code that its okay
|
|
|
|
* if a SACK arrives you can still defer input and let
|
|
|
|
* the current hpts timer run (this is usually set when
|
|
|
|
* a rack timer is up so we know SACK's are happening
|
|
|
|
* on the connection already and don't want to wakeup yet).
|
|
|
|
*
|
|
|
|
 * There is a common function within the rack_bbr_common code
|
|
|
|
* version i.e. ctf_do_queued_segments(). This function
|
2020-02-12 13:07:09 +00:00
|
|
|
* knows how to take the input queue of packets from
|
|
|
|
* tp->t_in_pkts and process them digging out
|
|
|
|
* all the arguments, calling any bpf tap and
|
2019-07-10 20:40:39 +00:00
|
|
|
* calling into tfb_do_segment_nounlock(). The common
|
2020-02-12 13:07:09 +00:00
|
|
|
* function (ctf_do_queued_segments()) requires that
|
2019-07-10 20:40:39 +00:00
|
|
|
* you have defined the tfb_do_segment_nounlock() as
|
|
|
|
* described above.
|
|
|
|
*
|
2021-12-02 22:45:04 +00:00
|
|
|
* Now the second function the tcp_hpts system provides is the ability
|
|
|
|
* to abort a connection later. Why would you want to do this?
|
|
|
|
* To not have to worry about untangling any recursive locks.
|
|
|
|
*
|
2019-07-10 20:40:39 +00:00
|
|
|
* The second feature of the input side of hpts is the
|
|
|
|
* dropping of a connection. This is due to the way that
|
|
|
|
 * locking may have occurred on the INP_WLOCK. So if
|
2018-04-19 13:37:59 +00:00
|
|
|
* a stack wants to drop a connection it calls:
|
|
|
|
*
|
|
|
|
* tcp_set_inp_to_drop(tp, ETIMEDOUT)
|
2020-02-12 13:07:09 +00:00
|
|
|
*
|
|
|
|
* To schedule the tcp_hpts system to call
|
|
|
|
*
|
2018-04-19 13:37:59 +00:00
|
|
|
* tcp_drop(tp, drop_reason)
|
|
|
|
*
|
|
|
|
* at a future point. This is quite handy to prevent locking
|
|
|
|
* issues when dropping connections.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <sys/param.h>
|
|
|
|
#include <sys/bus.h>
|
|
|
|
#include <sys/interrupt.h>
|
|
|
|
#include <sys/module.h>
|
|
|
|
#include <sys/kernel.h>
|
|
|
|
#include <sys/hhook.h>
|
|
|
|
#include <sys/malloc.h>
|
|
|
|
#include <sys/mbuf.h>
|
|
|
|
#include <sys/proc.h> /* for proc0 declaration */
|
|
|
|
#include <sys/socket.h>
|
|
|
|
#include <sys/socketvar.h>
|
|
|
|
#include <sys/sysctl.h>
|
|
|
|
#include <sys/systm.h>
|
|
|
|
#include <sys/refcount.h>
|
|
|
|
#include <sys/sched.h>
|
|
|
|
#include <sys/queue.h>
|
|
|
|
#include <sys/smp.h>
|
|
|
|
#include <sys/counter.h>
|
|
|
|
#include <sys/time.h>
|
|
|
|
#include <sys/kthread.h>
|
|
|
|
#include <sys/kern_prefetch.h>
|
|
|
|
|
|
|
|
#include <vm/uma.h>
|
2019-05-10 13:41:19 +00:00
|
|
|
#include <vm/vm.h>
|
2018-04-19 13:37:59 +00:00
|
|
|
|
|
|
|
#include <net/route.h>
|
|
|
|
#include <net/vnet.h>
|
|
|
|
|
2020-03-03 14:15:30 +00:00
|
|
|
#ifdef RSS
|
|
|
|
#include <net/netisr.h>
|
|
|
|
#include <net/rss_config.h>
|
|
|
|
#endif
|
|
|
|
|
2018-04-19 13:37:59 +00:00
|
|
|
#define TCPSTATES /* for logging */
|
|
|
|
|
|
|
|
#include <netinet/in.h>
|
|
|
|
#include <netinet/in_kdtrace.h>
|
|
|
|
#include <netinet/in_pcb.h>
|
|
|
|
#include <netinet/ip.h>
|
|
|
|
#include <netinet/ip_icmp.h> /* required for icmp_var.h */
|
|
|
|
#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
|
|
|
|
#include <netinet/ip_var.h>
|
|
|
|
#include <netinet/ip6.h>
|
|
|
|
#include <netinet6/in6_pcb.h>
|
|
|
|
#include <netinet6/ip6_var.h>
|
|
|
|
#include <netinet/tcp.h>
|
|
|
|
#include <netinet/tcp_fsm.h>
|
|
|
|
#include <netinet/tcp_seq.h>
|
|
|
|
#include <netinet/tcp_timer.h>
|
|
|
|
#include <netinet/tcp_var.h>
|
|
|
|
#include <netinet/tcpip.h>
|
|
|
|
#include <netinet/cc/cc.h>
|
|
|
|
#include <netinet/tcp_hpts.h>
|
2019-07-10 20:40:39 +00:00
|
|
|
#include <netinet/tcp_log_buf.h>
|
2018-04-19 13:37:59 +00:00
|
|
|
|
|
|
|
#ifdef tcpdebug
|
|
|
|
#include <netinet/tcp_debug.h>
|
|
|
|
#endif /* tcpdebug */
|
|
|
|
#ifdef tcp_offload
|
|
|
|
#include <netinet/tcp_offload.h>
|
|
|
|
#endif
|
|
|
|
|
2021-12-02 22:45:04 +00:00
|
|
|
/*
|
|
|
|
* The hpts uses a 102400 wheel. The wheel
|
|
|
|
* defines the time in 10 usec increments (102400 x 10).
|
|
|
|
* This gives a range of 10usec - 1024ms to place
|
|
|
|
* an entry within. If the user requests more than
|
|
|
|
 * 1.024 second, a remainder is attached and the hpts
|
|
|
|
* when seeing the remainder will re-insert the
|
|
|
|
* inpcb forward in time from where it is until
|
|
|
|
* the remainder is zero.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define NUM_OF_HPTSI_SLOTS 102400
|
|
|
|
|
|
|
|
/* Each hpts has its own p_mtx which is used for locking */
|
|
|
|
#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED)
|
|
|
|
#define HPTS_LOCK(hpts) mtx_lock(&(hpts)->p_mtx)
|
|
|
|
#define HPTS_UNLOCK(hpts) mtx_unlock(&(hpts)->p_mtx)
|
|
|
|
struct tcp_hpts_entry {
|
|
|
|
/* Cache line 0x00 */
|
|
|
|
struct mtx p_mtx; /* Mutex for hpts */
|
|
|
|
struct timeval p_mysleep; /* Our min sleep time */
|
|
|
|
uint64_t syscall_cnt;
|
|
|
|
uint64_t sleeping; /* What the actual sleep was (if sleeping) */
|
|
|
|
uint16_t p_hpts_active; /* Flag that says hpts is awake */
|
|
|
|
uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
|
|
|
|
uint32_t p_curtick; /* Tick in 10 us the hpts is going to */
|
|
|
|
uint32_t p_runningslot; /* Current tick we are at if we are running */
|
|
|
|
uint32_t p_prev_slot; /* Previous slot we were on */
|
|
|
|
uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */
|
|
|
|
uint32_t p_nxt_slot; /* The next slot outside the current range of
|
|
|
|
* slots that the hpts is running on. */
|
|
|
|
int32_t p_on_queue_cnt; /* Count on queue in this hpts */
|
|
|
|
uint32_t p_lasttick; /* Last tick before the current one */
|
|
|
|
uint8_t p_direct_wake :1, /* boolean */
|
|
|
|
p_on_min_sleep:1, /* boolean */
|
|
|
|
p_hpts_wake_scheduled:1, /* boolean */
|
|
|
|
p_avail:5;
|
|
|
|
uint8_t p_fill[3]; /* Fill to 32 bits */
|
|
|
|
/* Cache line 0x40 */
|
|
|
|
TAILQ_HEAD(, inpcb) p_dropq; /* Delayed drop queue */
|
|
|
|
struct hptsh {
|
|
|
|
TAILQ_HEAD(, inpcb) head;
|
|
|
|
uint32_t count;
|
|
|
|
uint32_t gencnt;
|
|
|
|
} *p_hptss; /* Hptsi wheel */
|
|
|
|
uint32_t p_dropq_cnt; /* Count on drop queue */
|
|
|
|
uint32_t p_dropq_gencnt;
|
|
|
|
uint32_t p_hpts_sleep_time; /* Current sleep interval having a max
|
|
|
|
* of 255ms */
|
|
|
|
uint32_t overidden_sleep; /* what was overrided by min-sleep for logging */
|
|
|
|
uint32_t saved_lasttick; /* for logging */
|
|
|
|
uint32_t saved_curtick; /* for logging */
|
|
|
|
uint32_t saved_curslot; /* for logging */
|
|
|
|
uint32_t saved_prev_slot; /* for logging */
|
|
|
|
uint32_t p_delayed_by; /* How much were we delayed by */
|
|
|
|
/* Cache line 0x80 */
|
|
|
|
struct sysctl_ctx_list hpts_ctx;
|
|
|
|
struct sysctl_oid *hpts_root;
|
|
|
|
struct intr_event *ie;
|
|
|
|
void *ie_cookie;
|
|
|
|
uint16_t p_num; /* The hpts number one per cpu */
|
|
|
|
uint16_t p_cpu; /* The hpts CPU */
|
|
|
|
/* There is extra space in here */
|
|
|
|
/* Cache line 0x100 */
|
|
|
|
struct callout co __aligned(CACHE_LINE_SIZE);
|
|
|
|
} __aligned(CACHE_LINE_SIZE);
|
|
|
|
|
|
|
|
/*
 * Global registry of all hpts instances; rp_ent has one entry per hpts
 * thread and cts_last_ran records when each last ran.
 */
static struct tcp_hptsi {
	struct tcp_hpts_entry **rp_ent;	/* Array of hptss */
	uint32_t *cts_last_ran;
	uint32_t rp_num_hptss;	/* Number of hpts threads */
} tcp_pace;
|
|
|
|
|
2018-04-19 13:37:59 +00:00
|
|
|
MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
|
|
|
|
#ifdef RSS
|
|
|
|
static int tcp_bind_threads = 1;
|
|
|
|
#else
|
2019-05-10 13:41:19 +00:00
|
|
|
static int tcp_bind_threads = 2;
|
2018-04-19 13:37:59 +00:00
|
|
|
#endif
|
2021-07-06 19:23:22 +00:00
|
|
|
static int tcp_use_irq_cpu = 0;
|
|
|
|
static uint32_t *cts_last_ran;
|
2019-07-10 20:40:39 +00:00
|
|
|
static int hpts_does_tp_logging = 0;
|
2021-07-06 19:23:22 +00:00
|
|
|
static int hpts_use_assigned_cpu = 1;
|
|
|
|
static int32_t hpts_uses_oldest = OLDEST_THRESHOLD;
|
2018-04-19 13:37:59 +00:00
|
|
|
|
2021-07-06 19:23:22 +00:00
|
|
|
static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout);
|
2018-04-19 13:37:59 +00:00
|
|
|
static void tcp_hpts_thread(void *ctx);
|
|
|
|
static void tcp_init_hptsi(void *st);
|
|
|
|
|
|
|
|
int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
|
2021-07-06 19:23:22 +00:00
|
|
|
static int conn_cnt_thresh = DEFAULT_CONNECTION_THESHOLD;
|
|
|
|
static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP;
|
|
|
|
static int32_t dynamic_max_sleep = DYNAMIC_MAX_SLEEP;
|
|
|
|
|
|
|
|
|
2018-04-19 13:37:59 +00:00
|
|
|
|
2020-02-26 14:26:36 +00:00
|
|
|
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
|
|
|
|
"TCP Hpts controls");
|
2021-07-06 19:23:22 +00:00
|
|
|
SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
|
|
|
|
"TCP Hpts statistics");
|
2018-04-19 13:37:59 +00:00
|
|
|
|
|
|
|
/*
 * vvp = tvp - uvp, normalizing tv_usec into [0, 1000000).
 * NOTE(review): duplicates the classic timersub() from <sys/time.h>;
 * confirm the header version is not already visible in this build.
 */
#define timersub(tvp, uvp, vvp)						\
	do {								\
		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\
		if ((vvp)->tv_usec < 0) {				\
			(vvp)->tv_sec--;				\
			(vvp)->tv_usec += 1000000;			\
		}							\
	} while (0)
|
|
|
|
|
|
|
|
static int32_t tcp_hpts_precision = 120;
|
|
|
|
|
2021-12-02 22:45:04 +00:00
|
|
|
/* Per-memory-domain CPU list, used when binding hpts threads to domains. */
static struct hpts_domain_info {
	int count;		/* number of valid entries in cpu[] */
	int cpu[MAXCPU];	/* CPU ids belonging to this domain */
} hpts_domains[MAXMEMDOM];
|
tcp_hpts: rewrite inpcb synchronization
Just trust the pcb database, that if we did in_pcbref(), no way
an inpcb can go away. And if we never put a dropped inpcb on
our queue, and tcp_discardcb() always removes an inpcb to be
dropped from the queue, then any inpcb on the queue is valid.
Now, to solve LOR between inpcb lock and HPTS queue lock do the
following trick. When we are about to process a certain time
slot, take the full queue of the head list into on stack list,
drop the HPTS lock and work on our queue. This of course opens
a race when an inpcb is being removed from the on stack queue,
which was already mentioned in comments. To address this race
introduce generation count into queues. If we want to remove
an inpcb with generation count mismatch, we can't do that, we
can only mark it with desired new time slot or -1 for remove.
Reviewed by: rrs
Differential revision: https://reviews.freebsd.org/D33026
2021-12-02 18:48:49 +00:00
|
|
|
|
2021-12-02 22:45:04 +00:00
|
|
|
/*
 * State of an inpcb with respect to the hpts wheel (inp_in_hpts).
 */
enum {
	IHPTS_NONE = 0,		/* not on the wheel, no hpts reference held */
	IHPTS_ONQUEUE,		/* linked on a wheel slot; hpts holds a ref */
	IHPTS_MOVING,		/* detached onto the hpts thread's on-stack
				 * list; reinsert/removal pending */
};
|
2021-12-02 21:35:14 +00:00
|
|
|
|
2019-07-10 20:40:39 +00:00
|
|
|
counter_u64_t hpts_hopelessly_behind;
|
|
|
|
|
2021-07-06 19:23:22 +00:00
|
|
|
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD,
|
2019-07-10 20:40:39 +00:00
|
|
|
&hpts_hopelessly_behind,
|
|
|
|
"Number of times hpts could not catch up and was behind hopelessly");
|
2018-04-19 13:37:59 +00:00
|
|
|
|
|
|
|
counter_u64_t hpts_loops;
|
|
|
|
|
2021-07-06 19:23:22 +00:00
|
|
|
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, loops, CTLFLAG_RD,
|
2018-04-19 13:37:59 +00:00
|
|
|
&hpts_loops, "Number of times hpts had to loop to catch up");
|
|
|
|
|
|
|
|
counter_u64_t back_tosleep;
|
|
|
|
|
2021-07-06 19:23:22 +00:00
|
|
|
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
|
2018-04-19 13:37:59 +00:00
|
|
|
&back_tosleep, "Number of times hpts found no tcbs");
|
|
|
|
|
2019-07-10 20:40:39 +00:00
|
|
|
counter_u64_t combined_wheel_wrap;
|
|
|
|
|
2021-07-06 19:23:22 +00:00
|
|
|
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD,
|
2019-07-10 20:40:39 +00:00
|
|
|
&combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
|
2018-04-19 13:37:59 +00:00
|
|
|
|
2019-07-10 20:40:39 +00:00
|
|
|
counter_u64_t wheel_wrap;
|
2018-04-19 13:37:59 +00:00
|
|
|
|
2021-07-06 19:23:22 +00:00
|
|
|
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, wheel_wrap, CTLFLAG_RD,
|
2019-07-10 20:40:39 +00:00
|
|
|
&wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
|
2018-04-19 13:37:59 +00:00
|
|
|
|
2021-07-06 19:23:22 +00:00
|
|
|
counter_u64_t hpts_direct_call;
|
|
|
|
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_call, CTLFLAG_RD,
|
|
|
|
&hpts_direct_call, "Number of times hpts was called by syscall/trap or other entry");
|
|
|
|
|
|
|
|
counter_u64_t hpts_wake_timeout;
|
|
|
|
|
|
|
|
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, timeout_wakeup, CTLFLAG_RD,
|
|
|
|
&hpts_wake_timeout, "Number of times hpts threads woke up via the callout expiring");
|
|
|
|
|
|
|
|
counter_u64_t hpts_direct_awakening;
|
|
|
|
|
|
|
|
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_awakening, CTLFLAG_RD,
|
|
|
|
&hpts_direct_awakening, "Number of times hpts threads woke up via the callout expiring");
|
2018-04-19 13:37:59 +00:00
|
|
|
|
2021-07-06 19:23:22 +00:00
|
|
|
counter_u64_t hpts_back_tosleep;
|
|
|
|
|
|
|
|
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, back_tosleep, CTLFLAG_RD,
|
|
|
|
&hpts_back_tosleep, "Number of times hpts threads woke up via the callout expiring and went back to sleep no work");
|
|
|
|
|
|
|
|
counter_u64_t cpu_uses_flowid;
|
|
|
|
counter_u64_t cpu_uses_random;
|
|
|
|
|
|
|
|
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_flowid, CTLFLAG_RD,
|
|
|
|
&cpu_uses_flowid, "Number of times when setting cpuid we used the flowid field");
|
|
|
|
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_random, CTLFLAG_RD,
|
|
|
|
&cpu_uses_random, "Number of times when setting cpuid we used the a random value");
|
|
|
|
|
|
|
|
TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
|
|
|
|
TUNABLE_INT("net.inet.tcp.use_irq", &tcp_use_irq_cpu);
|
|
|
|
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, bind_hptss, CTLFLAG_RD,
|
|
|
|
&tcp_bind_threads, 2,
|
|
|
|
"Thread Binding tunable");
|
|
|
|
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_irq, CTLFLAG_RD,
|
|
|
|
&tcp_use_irq_cpu, 0,
|
|
|
|
"Use of irq CPU tunable");
|
|
|
|
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
|
|
|
|
&tcp_hpts_precision, 120,
|
|
|
|
"Value for PRE() precision of callout");
|
|
|
|
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, cnt_thresh, CTLFLAG_RW,
|
|
|
|
&conn_cnt_thresh, 0,
|
|
|
|
"How many connections (below) make us use the callout based mechanism");
|
2019-07-10 20:40:39 +00:00
|
|
|
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
|
|
|
|
&hpts_does_tp_logging, 0,
|
|
|
|
"Do we add to any tp that has logging on pacer logs");
|
2021-07-06 19:23:22 +00:00
|
|
|
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_assigned_cpu, CTLFLAG_RW,
|
|
|
|
&hpts_use_assigned_cpu, 0,
|
|
|
|
"Do we start any hpts timer on the assigned cpu?");
|
|
|
|
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_oldest, CTLFLAG_RW,
|
|
|
|
&hpts_uses_oldest, OLDEST_THRESHOLD,
|
|
|
|
"Do syscalls look for the hpts that has been the longest since running (or just use cpu no if 0)?");
|
|
|
|
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_minsleep, CTLFLAG_RW,
|
|
|
|
&dynamic_min_sleep, 250,
|
|
|
|
"What is the dynamic minsleep value?");
|
|
|
|
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_maxsleep, CTLFLAG_RW,
|
|
|
|
&dynamic_max_sleep, 5000,
|
|
|
|
"What is the dynamic maxsleep value?");
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2019-07-10 20:40:39 +00:00
|
|
|
|
|
|
|
static int32_t max_pacer_loops = 10;
|
|
|
|
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, loopmax, CTLFLAG_RW,
|
|
|
|
&max_pacer_loops, 10,
|
|
|
|
"What is the maximum number of times the pacer will loop trying to catch up");
|
|
|
|
|
|
|
|
#define HPTS_MAX_SLEEP_ALLOWED (NUM_OF_HPTSI_SLOTS/2)
|
|
|
|
|
|
|
|
static uint32_t hpts_sleep_max = HPTS_MAX_SLEEP_ALLOWED;
|
|
|
|
|
|
|
|
static int
|
|
|
|
sysctl_net_inet_tcp_hpts_max_sleep(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
uint32_t new;
|
|
|
|
|
|
|
|
new = hpts_sleep_max;
|
|
|
|
error = sysctl_handle_int(oidp, &new, 0, req);
|
|
|
|
if (error == 0 && req->newptr) {
|
2021-07-06 19:23:22 +00:00
|
|
|
if ((new < dynamic_min_sleep) ||
|
2020-02-12 13:07:09 +00:00
|
|
|
(new > HPTS_MAX_SLEEP_ALLOWED))
|
2019-07-10 20:40:39 +00:00
|
|
|
error = EINVAL;
|
|
|
|
else
|
|
|
|
hpts_sleep_max = new;
|
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
2018-04-19 13:37:59 +00:00
|
|
|
|
2021-07-06 19:23:22 +00:00
|
|
|
static int
|
|
|
|
sysctl_net_inet_tcp_hpts_min_sleep(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
uint32_t new;
|
|
|
|
|
|
|
|
new = tcp_min_hptsi_time;
|
|
|
|
error = sysctl_handle_int(oidp, &new, 0, req);
|
|
|
|
if (error == 0 && req->newptr) {
|
|
|
|
if (new < LOWEST_SLEEP_ALLOWED)
|
|
|
|
error = EINVAL;
|
|
|
|
else
|
|
|
|
tcp_min_hptsi_time = new;
|
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2019-07-10 20:40:39 +00:00
|
|
|
SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep,
|
2020-02-26 14:26:36 +00:00
|
|
|
CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
|
2018-04-19 13:37:59 +00:00
|
|
|
&hpts_sleep_max, 0,
|
2019-07-10 20:40:39 +00:00
|
|
|
&sysctl_net_inet_tcp_hpts_max_sleep, "IU",
|
|
|
|
"Maximum time hpts will sleep");
|
2018-04-19 13:37:59 +00:00
|
|
|
|
2021-07-06 19:23:22 +00:00
|
|
|
SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, minsleep,
|
|
|
|
CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
|
2018-04-19 13:37:59 +00:00
|
|
|
&tcp_min_hptsi_time, 0,
|
2021-07-06 19:23:22 +00:00
|
|
|
&sysctl_net_inet_tcp_hpts_min_sleep, "IU",
|
2018-04-19 13:37:59 +00:00
|
|
|
"The minimum time the hpts must sleep before processing more slots");
|
|
|
|
|
2021-07-06 19:23:22 +00:00
|
|
|
static int ticks_indicate_more_sleep = TICKS_INDICATE_MORE_SLEEP;
|
|
|
|
static int ticks_indicate_less_sleep = TICKS_INDICATE_LESS_SLEEP;
|
|
|
|
static int tcp_hpts_no_wake_over_thresh = 1;
|
|
|
|
|
|
|
|
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, more_sleep, CTLFLAG_RW,
|
|
|
|
&ticks_indicate_more_sleep, 0,
|
|
|
|
"If we only process this many or less on a timeout, we need longer sleep on the next callout");
|
|
|
|
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, less_sleep, CTLFLAG_RW,
|
|
|
|
&ticks_indicate_less_sleep, 0,
|
|
|
|
"If we process this many or more on a timeout, we need less sleep on the next callout");
|
|
|
|
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW,
|
|
|
|
&tcp_hpts_no_wake_over_thresh, 0,
|
|
|
|
"When we are over the threshold on the pacer do we prohibit wakeups?");
|
2018-04-19 13:37:59 +00:00
|
|
|
|
|
|
|
/*
 * Emit a BBR_LOG_HPTSDIAG record into tp's log describing the pacer's
 * current wheel position, run parameters, and saved (pre-override)
 * sleep state.  tv supplies the event timestamp.
 */
static void
tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
    int slots_to_run, int idx, int from_callout)
{
	union tcp_log_stackspecific log;
	/*
	 * Unused logs are
	 * 64 bit - delRate, rttProp, bw_inuse
	 * 16 bit - cwnd_gain
	 *  8 bit - bbr_state, bbr_substate, inhpts, ininput;
	 */
	memset(&log.u_bbr, 0, sizeof(log.u_bbr));
	log.u_bbr.flex1 = hpts->p_nxt_slot;
	log.u_bbr.flex2 = hpts->p_cur_slot;
	log.u_bbr.flex3 = hpts->p_prev_slot;
	log.u_bbr.flex4 = idx;
	log.u_bbr.flex5 = hpts->p_curtick;
	log.u_bbr.flex6 = hpts->p_on_queue_cnt;
	log.u_bbr.flex7 = hpts->p_cpu;
	log.u_bbr.flex8 = (uint8_t)from_callout;
	log.u_bbr.inflight = slots_to_run;
	log.u_bbr.applimited = hpts->overidden_sleep;
	log.u_bbr.delivered = hpts->saved_curtick;
	log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
	log.u_bbr.epoch = hpts->saved_curslot;
	log.u_bbr.lt_epoch = hpts->saved_prev_slot;
	log.u_bbr.pkts_out = hpts->p_delayed_by;
	log.u_bbr.lost = hpts->p_hpts_sleep_time;
	/* p_cpu is logged twice (flex7/pacing_gain) — kept as-is */
	log.u_bbr.pacing_gain = hpts->p_cpu;
	log.u_bbr.pkt_epoch = hpts->p_runningslot;
	log.u_bbr.use_lt_bw = 1;
	TCP_LOG_EVENTP(tp, NULL,
	    &tp->t_inpcb->inp_socket->so_rcv,
	    &tp->t_inpcb->inp_socket->so_snd,
	    BBR_LOG_HPTSDIAG, 0,
	    0, &log, false, tv);
}
|
|
|
|
|
|
|
|
static void
|
2021-07-06 19:23:22 +00:00
|
|
|
tcp_wakehpts(struct tcp_hpts_entry *hpts)
|
2018-04-19 13:37:59 +00:00
|
|
|
{
|
2021-07-06 19:23:22 +00:00
|
|
|
HPTS_MTX_ASSERT(hpts);
|
2018-04-19 13:37:59 +00:00
|
|
|
|
2021-07-06 19:23:22 +00:00
|
|
|
if (tcp_hpts_no_wake_over_thresh && (hpts->p_on_queue_cnt >= conn_cnt_thresh)) {
|
|
|
|
hpts->p_direct_wake = 0;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (hpts->p_hpts_wake_scheduled == 0) {
|
|
|
|
hpts->p_hpts_wake_scheduled = 1;
|
|
|
|
swi_sched(hpts->ie_cookie, 0);
|
|
|
|
}
|
2018-04-19 13:37:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2021-07-06 19:23:22 +00:00
|
|
|
hpts_timeout_swi(void *arg)
|
2018-04-19 13:37:59 +00:00
|
|
|
{
|
2021-07-06 19:23:22 +00:00
|
|
|
struct tcp_hpts_entry *hpts;
|
|
|
|
|
|
|
|
hpts = (struct tcp_hpts_entry *)arg;
|
|
|
|
swi_sched(hpts->ie_cookie, 0);
|
2018-04-19 13:37:59 +00:00
|
|
|
}
|
|
|
|
|
2021-12-02 22:45:04 +00:00
|
|
|
/*
 * Link inp onto the wheel slot inp->inp_hptsslot of this hpts instance.
 * A first-time insert (IHPTS_NONE) takes an inpcb reference; an inp in
 * IHPTS_MOVING already holds the hpts reference and is simply requeued.
 * The inp is stamped with the slot's current generation count so a
 * later remove can detect that the hpts thread has since taken the
 * slot's list onto its stack.  Caller holds the inp write lock and the
 * hpts mutex, and the inp must be assigned to this hpts' CPU.
 */
static void
inp_hpts_insert(struct inpcb *inp, struct tcp_hpts_entry *hpts)
{
	struct hptsh *hptsh;

	INP_WLOCK_ASSERT(inp);
	HPTS_MTX_ASSERT(hpts);
	MPASS(hpts->p_cpu == inp->inp_hpts_cpu);
	MPASS(!(inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)));

	hptsh = &hpts->p_hptss[inp->inp_hptsslot];

	if (inp->inp_in_hpts == IHPTS_NONE) {
		inp->inp_in_hpts = IHPTS_ONQUEUE;
		in_pcbref(inp);		/* hpts now holds a reference */
	} else if (inp->inp_in_hpts == IHPTS_MOVING) {
		inp->inp_in_hpts = IHPTS_ONQUEUE;	/* ref already held */
	} else
		MPASS(inp->inp_in_hpts == IHPTS_ONQUEUE);
	inp->inp_hpts_gencnt = hptsh->gencnt;

	TAILQ_INSERT_TAIL(&hptsh->head, inp, inp_hpts);
	hptsh->count++;
	hpts->p_on_queue_cnt++;
}
|
|
|
|
|
2021-12-02 22:45:04 +00:00
|
|
|
static struct tcp_hpts_entry *
|
2021-12-02 21:35:14 +00:00
|
|
|
tcp_hpts_lock(struct inpcb *inp)
|
2018-04-19 13:37:59 +00:00
|
|
|
{
|
|
|
|
struct tcp_hpts_entry *hpts;
|
|
|
|
|
2021-12-02 22:45:04 +00:00
|
|
|
INP_LOCK_ASSERT(inp);
|
|
|
|
|
|
|
|
hpts = tcp_pace.rp_ent[inp->inp_hpts_cpu];
|
|
|
|
HPTS_LOCK(hpts);
|
|
|
|
|
tcp_hpts: rewrite inpcb synchronization
Just trust the pcb database, that if we did in_pcbref(), no way
an inpcb can go away. And if we never put a dropped inpcb on
our queue, and tcp_discardcb() always removes an inpcb to be
dropped from the queue, then any inpcb on the queue is valid.
Now, to solve LOR between inpcb lock and HPTS queue lock do the
following trick. When we are about to process a certain time
slot, take the full queue of the head list into on stack list,
drop the HPTS lock and work on our queue. This of course opens
a race when an inpcb is being removed from the on stack queue,
which was already mentioned in comments. To address this race
introduce generation count into queues. If we want to remove
an inpcb with generation count mismatch, we can't do that, we
can only mark it with desired new time slot or -1 for remove.
Reviewed by: rrs
Differential revision: https://reviews.freebsd.org/D33026
2021-12-02 18:48:49 +00:00
|
|
|
return (hpts);
|
2018-04-19 13:37:59 +00:00
|
|
|
}
|
|
|
|
|
2021-12-02 22:45:04 +00:00
|
|
|
static struct tcp_hpts_entry *
|
|
|
|
tcp_dropq_lock(struct inpcb *inp)
|
2018-04-19 13:37:59 +00:00
|
|
|
{
|
2021-12-02 21:35:14 +00:00
|
|
|
struct tcp_hpts_entry *hpts;
|
tcp_hpts: rewrite inpcb synchronization
Just trust the pcb database, that if we did in_pcbref(), no way
an inpcb can go away. And if we never put a dropped inpcb on
our queue, and tcp_discardcb() always removes an inpcb to be
dropped from the queue, then any inpcb on the queue is valid.
Now, to solve LOR between inpcb lock and HPTS queue lock do the
following trick. When we are about to process a certain time
slot, take the full queue of the head list into on stack list,
drop the HPTS lock and work on our queue. This of course opens
a race when an inpcb is being removed from the on stack queue,
which was already mentioned in comments. To address this race
introduce generation count into queues. If we want to remove
an inpcb with generation count mismatch, we can't do that, we
can only mark it with desired new time slot or -1 for remove.
Reviewed by: rrs
Differential revision: https://reviews.freebsd.org/D33026
2021-12-02 18:48:49 +00:00
|
|
|
|
2021-12-02 22:45:04 +00:00
|
|
|
INP_LOCK_ASSERT(inp);
|
|
|
|
|
|
|
|
hpts = tcp_pace.rp_ent[inp->inp_dropq_cpu];
|
|
|
|
HPTS_LOCK(hpts);
|
|
|
|
|
2021-12-02 21:35:14 +00:00
|
|
|
return (hpts);
|
2018-04-19 13:37:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2021-12-02 22:45:04 +00:00
|
|
|
inp_hpts_release(struct inpcb *inp)
|
2018-04-19 13:37:59 +00:00
|
|
|
{
|
2021-12-02 22:45:04 +00:00
|
|
|
bool released __diagused;
|
2021-12-02 18:48:48 +00:00
|
|
|
|
2021-12-02 22:45:04 +00:00
|
|
|
inp->inp_in_hpts = IHPTS_NONE;
|
|
|
|
released = in_pcbrele_wlocked(inp);
|
|
|
|
MPASS(released == false);
|
2021-12-02 21:35:14 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2021-12-02 22:45:04 +00:00
|
|
|
tcp_dropq_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp)
|
2021-12-02 21:35:14 +00:00
|
|
|
{
|
2021-12-02 22:45:04 +00:00
|
|
|
bool released __diagused;
|
2021-12-02 18:48:48 +00:00
|
|
|
|
2021-12-02 21:35:14 +00:00
|
|
|
HPTS_MTX_ASSERT(hpts);
|
2021-12-02 22:45:04 +00:00
|
|
|
INP_WLOCK_ASSERT(inp);
|
|
|
|
|
|
|
|
if (inp->inp_in_dropq != IHPTS_ONQUEUE)
|
|
|
|
return;
|
|
|
|
|
|
|
|
MPASS(hpts->p_cpu == inp->inp_dropq_cpu);
|
|
|
|
if (__predict_true(inp->inp_dropq_gencnt == hpts->p_dropq_gencnt)) {
|
|
|
|
TAILQ_REMOVE(&hpts->p_dropq, inp, inp_dropq);
|
|
|
|
MPASS(hpts->p_dropq_cnt > 0);
|
|
|
|
hpts->p_dropq_cnt--;
|
|
|
|
inp->inp_in_dropq = IHPTS_NONE;
|
|
|
|
released = in_pcbrele_wlocked(inp);
|
|
|
|
MPASS(released == false);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* tcp_delayed_drop() now owns the TAILQ head of this inp.
|
|
|
|
* Can't TAILQ_REMOVE, just mark it.
|
|
|
|
*/
|
|
|
|
#ifdef INVARIANTS
|
|
|
|
struct inpcb *tmp;
|
|
|
|
|
|
|
|
TAILQ_FOREACH(tmp, &hpts->p_dropq, inp_dropq)
|
|
|
|
MPASS(tmp != inp);
|
|
|
|
#endif
|
|
|
|
inp->inp_in_dropq = IHPTS_MOVING;
|
2021-12-02 21:35:14 +00:00
|
|
|
}
|
2021-12-02 22:45:04 +00:00
|
|
|
|
2018-04-19 13:37:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Called normally with the INP_LOCKED but it
|
|
|
|
* does not matter, the hpts lock is the key
|
|
|
|
* but the lock order allows us to hold the
|
|
|
|
* INP lock and then get the hpts lock.
|
|
|
|
*
|
|
|
|
* Valid values in the flags are
|
|
|
|
* HPTS_REMOVE_OUTPUT - remove from the output of the hpts.
|
2021-12-02 22:45:04 +00:00
|
|
|
* HPTS_REMOVE_DROPQ - remove from the drop queue of the hpts.
|
2020-02-12 13:07:09 +00:00
|
|
|
* Note that you can use one or both values together
|
2019-07-10 20:40:39 +00:00
|
|
|
* and get two actions.
|
2018-04-19 13:37:59 +00:00
|
|
|
*/
|
|
|
|
void
|
|
|
|
__tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line)
|
|
|
|
{
|
|
|
|
struct tcp_hpts_entry *hpts;
|
2021-12-02 22:45:04 +00:00
|
|
|
struct hptsh *hptsh;
|
2018-04-19 13:37:59 +00:00
|
|
|
|
|
|
|
INP_WLOCK_ASSERT(inp);
|
2021-12-02 22:45:04 +00:00
|
|
|
|
|
|
|
if (flags & HPTS_REMOVE_DROPQ) {
|
|
|
|
hpts = tcp_dropq_lock(inp);
|
|
|
|
tcp_dropq_remove(hpts, inp);
|
2018-04-19 13:37:59 +00:00
|
|
|
mtx_unlock(&hpts->p_mtx);
|
|
|
|
}
|
2021-12-02 22:45:04 +00:00
|
|
|
|
|
|
|
MPASS(flags & HPTS_REMOVE_OUTPUT);
|
|
|
|
|
|
|
|
hpts = tcp_hpts_lock(inp);
|
|
|
|
if (inp->inp_in_hpts == IHPTS_ONQUEUE) {
|
|
|
|
hptsh = &hpts->p_hptss[inp->inp_hptsslot];
|
|
|
|
inp->inp_hpts_request = 0;
|
|
|
|
if (__predict_true(inp->inp_hpts_gencnt == hptsh->gencnt)) {
|
|
|
|
TAILQ_REMOVE(&hptsh->head, inp, inp_hpts);
|
|
|
|
MPASS(hptsh->count > 0);
|
|
|
|
hptsh->count--;
|
|
|
|
MPASS(hpts->p_on_queue_cnt > 0);
|
|
|
|
hpts->p_on_queue_cnt--;
|
|
|
|
inp_hpts_release(inp);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* tcp_hptsi() now owns the TAILQ head of this inp.
|
|
|
|
* Can't TAILQ_REMOVE, just mark it.
|
|
|
|
*/
|
|
|
|
#ifdef INVARIANTS
|
|
|
|
struct inpcb *tmp;
|
|
|
|
|
|
|
|
TAILQ_FOREACH(tmp, &hptsh->head, inp_hpts)
|
|
|
|
MPASS(tmp != inp);
|
|
|
|
#endif
|
|
|
|
inp->inp_in_hpts = IHPTS_MOVING;
|
|
|
|
inp->inp_hptsslot = -1;
|
|
|
|
}
|
|
|
|
} else if (inp->inp_in_hpts == IHPTS_MOVING) {
|
|
|
|
/*
|
|
|
|
* Handle a special race condition:
|
|
|
|
* tcp_hptsi() moves inpcb to detached tailq
|
|
|
|
* tcp_hpts_remove() marks as IHPTS_MOVING, slot = -1
|
|
|
|
* tcp_hpts_insert() sets slot to a meaningful value
|
|
|
|
* tcp_hpts_remove() again (we are here!), then in_pcbdrop()
|
|
|
|
* tcp_hptsi() finds pcb with meaningful slot and INP_DROPPED
|
|
|
|
*/
|
|
|
|
inp->inp_hptsslot = -1;
|
tcp_hpts: rewrite inpcb synchronization
Just trust the pcb database, that if we did in_pcbref(), no way
an inpcb can go away. And if we never put a dropped inpcb on
our queue, and tcp_discardcb() always removes an inpcb to be
dropped from the queue, then any inpcb on the queue is valid.
Now, to solve LOR between inpcb lock and HPTS queue lock do the
following trick. When we are about to process a certain time
slot, take the full queue of the head list into on stack list,
drop the HPTS lock and work on our queue. This of course opens
a race when an inpcb is being removed from the on stack queue,
which was already mentioned in comments. To address this race
introduce generation count into queues. If we want to remove
an inpcb with generation count mismatch, we can't do that, we
can only mark it with desired new time slot or -1 for remove.
Reviewed by: rrs
Differential revision: https://reviews.freebsd.org/D33026
2021-12-02 18:48:49 +00:00
|
|
|
}
|
2021-12-02 22:45:04 +00:00
|
|
|
HPTS_UNLOCK(hpts);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool
|
|
|
|
tcp_in_hpts(struct inpcb *inp)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (inp->inp_in_hpts == IHPTS_ONQUEUE);
|
2018-04-19 13:37:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int
|
2021-07-06 19:23:22 +00:00
|
|
|
hpts_slot(uint32_t wheel_slot, uint32_t plus)
|
2018-04-19 13:37:59 +00:00
|
|
|
{
|
2019-07-10 20:40:39 +00:00
|
|
|
/*
|
|
|
|
* Given a slot on the wheel, what slot
|
|
|
|
* is that plus ticks out?
|
|
|
|
*/
|
2021-07-06 19:23:22 +00:00
|
|
|
KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_slot));
|
|
|
|
return ((wheel_slot + plus) % NUM_OF_HPTSI_SLOTS);
|
2019-07-10 20:40:39 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int
|
|
|
|
tick_to_wheel(uint32_t cts_in_wticks)
|
|
|
|
{
|
2020-02-12 13:07:09 +00:00
|
|
|
/*
|
2021-07-06 19:23:22 +00:00
|
|
|
* Given a timestamp in ticks (so by
|
|
|
|
* default to get it to a real time one
|
|
|
|
* would multiply by 10.. i.e the number
|
|
|
|
* of ticks in a slot) map it to our limited
|
|
|
|
* space wheel.
|
2019-07-10 20:40:39 +00:00
|
|
|
*/
|
|
|
|
return (cts_in_wticks % NUM_OF_HPTSI_SLOTS);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int
|
2021-07-06 19:23:22 +00:00
|
|
|
hpts_slots_diff(int prev_slot, int slot_now)
|
2019-07-10 20:40:39 +00:00
|
|
|
{
|
|
|
|
/*
|
2021-07-06 19:23:22 +00:00
|
|
|
* Given two slots that are someplace
|
2019-07-10 20:40:39 +00:00
|
|
|
* on our wheel. How far are they apart?
|
|
|
|
*/
|
2021-07-06 19:23:22 +00:00
|
|
|
if (slot_now > prev_slot)
|
|
|
|
return (slot_now - prev_slot);
|
|
|
|
else if (slot_now == prev_slot)
|
2020-02-12 13:07:09 +00:00
|
|
|
/*
|
|
|
|
* Special case, same means we can go all of our
|
2019-07-10 20:40:39 +00:00
|
|
|
* wheel less one slot.
|
|
|
|
*/
|
|
|
|
return (NUM_OF_HPTSI_SLOTS - 1);
|
|
|
|
else
|
2021-07-06 19:23:22 +00:00
|
|
|
return ((NUM_OF_HPTSI_SLOTS - prev_slot) + slot_now);
|
2019-07-10 20:40:39 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2021-07-06 19:23:22 +00:00
|
|
|
* Given a slot on the wheel that is the current time
|
|
|
|
* mapped to the wheel (wheel_slot), what is the maximum
|
2019-07-10 20:40:39 +00:00
|
|
|
* distance forward that can be obtained without
|
2021-07-06 19:23:22 +00:00
|
|
|
* wrapping past either prev_slot or running_slot
|
2019-07-10 20:40:39 +00:00
|
|
|
* depending on the htps state? Also if passed
|
2021-07-06 19:23:22 +00:00
|
|
|
* a uint32_t *, fill it with the slot location.
|
2019-07-10 20:40:39 +00:00
|
|
|
*
|
|
|
|
* Note if you do not give this function the current
|
2021-07-06 19:23:22 +00:00
|
|
|
* time (that you think it is) mapped to the wheel slot
|
2019-07-10 20:40:39 +00:00
|
|
|
* then the results will not be what you expect and
|
|
|
|
* could lead to invalid inserts.
|
|
|
|
*/
|
|
|
|
static inline int32_t
|
2021-07-06 19:23:22 +00:00
|
|
|
max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *target_slot)
|
2019-07-10 20:40:39 +00:00
|
|
|
{
|
2021-07-06 19:23:22 +00:00
|
|
|
uint32_t dis_to_travel, end_slot, pacer_to_now, avail_on_wheel;
|
2019-07-10 20:40:39 +00:00
|
|
|
|
|
|
|
if ((hpts->p_hpts_active == 1) &&
|
|
|
|
(hpts->p_wheel_complete == 0)) {
|
2021-07-06 19:23:22 +00:00
|
|
|
end_slot = hpts->p_runningslot;
|
2019-07-10 20:40:39 +00:00
|
|
|
/* Back up one tick */
|
2021-07-06 19:23:22 +00:00
|
|
|
if (end_slot == 0)
|
|
|
|
end_slot = NUM_OF_HPTSI_SLOTS - 1;
|
2019-07-10 20:40:39 +00:00
|
|
|
else
|
2021-07-06 19:23:22 +00:00
|
|
|
end_slot--;
|
|
|
|
if (target_slot)
|
|
|
|
*target_slot = end_slot;
|
2019-07-10 20:40:39 +00:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* For the case where we are
|
|
|
|
* not active, or we have
|
|
|
|
* completed the pass over
|
|
|
|
* the wheel, we can use the
|
|
|
|
* prev tick and subtract one from it. This puts us
|
|
|
|
* as far out as possible on the wheel.
|
|
|
|
*/
|
2021-07-06 19:23:22 +00:00
|
|
|
end_slot = hpts->p_prev_slot;
|
|
|
|
if (end_slot == 0)
|
|
|
|
end_slot = NUM_OF_HPTSI_SLOTS - 1;
|
2019-07-10 20:40:39 +00:00
|
|
|
else
|
2021-07-06 19:23:22 +00:00
|
|
|
end_slot--;
|
|
|
|
if (target_slot)
|
|
|
|
*target_slot = end_slot;
|
2020-02-12 13:07:09 +00:00
|
|
|
/*
|
|
|
|
* Now we have close to the full wheel left minus the
|
2019-07-10 20:40:39 +00:00
|
|
|
* time it has been since the pacer went to sleep. Note
|
|
|
|
* that wheel_tick, passed in, should be the current time
|
|
|
|
* from the perspective of the caller, mapped to the wheel.
|
|
|
|
*/
|
2021-07-06 19:23:22 +00:00
|
|
|
if (hpts->p_prev_slot != wheel_slot)
|
|
|
|
dis_to_travel = hpts_slots_diff(hpts->p_prev_slot, wheel_slot);
|
2019-07-10 20:40:39 +00:00
|
|
|
else
|
|
|
|
dis_to_travel = 1;
|
2020-02-12 13:07:09 +00:00
|
|
|
/*
|
|
|
|
* dis_to_travel in this case is the space from when the
|
2021-07-06 19:23:22 +00:00
|
|
|
* pacer stopped (p_prev_slot) and where our wheel_slot
|
2020-02-12 13:07:09 +00:00
|
|
|
* is now. To know how many slots we can put it in we
|
2019-07-10 20:40:39 +00:00
|
|
|
* subtract from the wheel size. We would not want
|
|
|
|
* to place something after p_prev_slot or it will
|
|
|
|
* get ran too soon.
|
|
|
|
*/
|
|
|
|
return (NUM_OF_HPTSI_SLOTS - dis_to_travel);
|
|
|
|
}
|
2020-02-12 13:07:09 +00:00
|
|
|
/*
|
2021-07-06 19:23:22 +00:00
|
|
|
* So how many slots are open between p_runningslot -> p_cur_slot
|
2019-07-10 20:40:39 +00:00
|
|
|
* that is what is currently un-available for insertion. Special
|
|
|
|
* case when we are at the last slot, this gets 1, so that
|
|
|
|
* the answer to how many slots are available is all but 1.
|
|
|
|
*/
|
2021-07-06 19:23:22 +00:00
|
|
|
if (hpts->p_runningslot == hpts->p_cur_slot)
|
2019-07-10 20:40:39 +00:00
|
|
|
dis_to_travel = 1;
|
|
|
|
else
|
2021-07-06 19:23:22 +00:00
|
|
|
dis_to_travel = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot);
|
2020-02-12 13:07:09 +00:00
|
|
|
/*
|
2019-07-10 20:40:39 +00:00
|
|
|
* How long has the pacer been running?
|
|
|
|
*/
|
2021-07-06 19:23:22 +00:00
|
|
|
if (hpts->p_cur_slot != wheel_slot) {
|
2019-07-10 20:40:39 +00:00
|
|
|
/* The pacer is a bit late */
|
2021-07-06 19:23:22 +00:00
|
|
|
pacer_to_now = hpts_slots_diff(hpts->p_cur_slot, wheel_slot);
|
2019-07-10 20:40:39 +00:00
|
|
|
} else {
|
|
|
|
/* The pacer is right on time, now == pacers start time */
|
|
|
|
pacer_to_now = 0;
|
|
|
|
}
|
2020-02-12 13:07:09 +00:00
|
|
|
/*
|
2019-07-10 20:40:39 +00:00
|
|
|
* To get the number left we can insert into we simply
|
|
|
|
* subract the distance the pacer has to run from how
|
|
|
|
* many slots there are.
|
|
|
|
*/
|
|
|
|
avail_on_wheel = NUM_OF_HPTSI_SLOTS - dis_to_travel;
|
2020-02-12 13:07:09 +00:00
|
|
|
/*
|
|
|
|
* Now how many of those we will eat due to the pacer's
|
|
|
|
* time (p_cur_slot) of start being behind the
|
2021-07-06 19:23:22 +00:00
|
|
|
* real time (wheel_slot)?
|
2019-07-10 20:40:39 +00:00
|
|
|
*/
|
|
|
|
if (avail_on_wheel <= pacer_to_now) {
|
2020-02-12 13:07:09 +00:00
|
|
|
/*
|
2019-07-10 20:40:39 +00:00
|
|
|
* Wheel wrap, we can't fit on the wheel, that
|
|
|
|
* is unusual the system must be way overloaded!
|
2021-07-06 19:23:22 +00:00
|
|
|
* Insert into the assured slot, and return special
|
2019-07-10 20:40:39 +00:00
|
|
|
* "0".
|
|
|
|
*/
|
|
|
|
counter_u64_add(combined_wheel_wrap, 1);
|
2021-07-06 19:23:22 +00:00
|
|
|
*target_slot = hpts->p_nxt_slot;
|
2019-07-10 20:40:39 +00:00
|
|
|
return (0);
|
|
|
|
} else {
|
2020-02-12 13:07:09 +00:00
|
|
|
/*
|
2019-07-10 20:40:39 +00:00
|
|
|
* We know how many slots are open
|
|
|
|
* on the wheel (the reverse of what
|
|
|
|
* is left to run. Take away the time
|
2021-07-06 19:23:22 +00:00
|
|
|
* the pacer started to now (wheel_slot)
|
2019-07-10 20:40:39 +00:00
|
|
|
* and that tells you how many slots are
|
|
|
|
* open that can be inserted into that won't
|
|
|
|
* be touched by the pacer until later.
|
|
|
|
*/
|
|
|
|
return (avail_on_wheel - pacer_to_now);
|
|
|
|
}
|
2018-04-19 13:37:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2019-07-10 20:40:39 +00:00
|
|
|
#ifdef INVARIANTS
|
|
|
|
static void
|
|
|
|
check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t inp_hptsslot, int line)
|
|
|
|
{
|
|
|
|
/*
|
2020-02-12 13:07:09 +00:00
|
|
|
* Sanity checks for the pacer with invariants
|
2019-07-10 20:40:39 +00:00
|
|
|
* on insert.
|
|
|
|
*/
|
2021-07-06 19:23:22 +00:00
|
|
|
KASSERT(inp_hptsslot < NUM_OF_HPTSI_SLOTS,
|
|
|
|
("hpts:%p inp:%p slot:%d > max",
|
|
|
|
hpts, inp, inp_hptsslot));
|
2019-07-10 20:40:39 +00:00
|
|
|
if ((hpts->p_hpts_active) &&
|
|
|
|
(hpts->p_wheel_complete == 0)) {
|
2020-02-12 13:07:09 +00:00
|
|
|
/*
|
2019-07-10 20:40:39 +00:00
|
|
|
* If the pacer is processing a arc
|
|
|
|
* of the wheel, we need to make
|
|
|
|
* sure we are not inserting within
|
|
|
|
* that arc.
|
|
|
|
*/
|
|
|
|
int distance, yet_to_run;
|
|
|
|
|
2021-07-06 19:23:22 +00:00
|
|
|
distance = hpts_slots_diff(hpts->p_runningslot, inp_hptsslot);
|
|
|
|
if (hpts->p_runningslot != hpts->p_cur_slot)
|
|
|
|
yet_to_run = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot);
|
2019-07-10 20:40:39 +00:00
|
|
|
else
|
|
|
|
yet_to_run = 0; /* processing last slot */
|
2021-07-06 19:23:22 +00:00
|
|
|
KASSERT(yet_to_run <= distance,
|
|
|
|
("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d",
|
|
|
|
hpts, inp, inp_hptsslot,
|
|
|
|
distance, yet_to_run,
|
|
|
|
hpts->p_runningslot, hpts->p_cur_slot));
|
2019-07-10 20:40:39 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2021-12-02 22:45:04 +00:00
|
|
|
uint32_t
|
|
|
|
tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag)
|
2018-04-19 13:37:59 +00:00
|
|
|
{
|
2021-12-02 22:45:04 +00:00
|
|
|
struct tcp_hpts_entry *hpts;
|
|
|
|
struct timeval tv;
|
|
|
|
uint32_t slot_on, wheel_cts, last_slot, need_new_to = 0;
|
|
|
|
int32_t wheel_slot, maxslots;
|
2021-07-06 19:23:22 +00:00
|
|
|
int cpu;
|
2021-12-02 22:45:04 +00:00
|
|
|
bool need_wakeup = false;
|
tcp_hpts: rewrite inpcb synchronization
Just trust the pcb database, that if we did in_pcbref(), no way
an inpcb can go away. And if we never put a dropped inpcb on
our queue, and tcp_discardcb() always removes an inpcb to be
dropped from the queue, then any inpcb on the queue is valid.
Now, to solve LOR between inpcb lock and HPTS queue lock do the
following trick. When we are about to process a certain time
slot, take the full queue of the head list into on stack list,
drop the HPTS lock and work on our queue. This of course opens
a race when an inpcb is being removed from the on stack queue,
which was already mentioned in comments. To address this race
introduce generation count into queues. If we want to remove
an inpcb with generation count mismatch, we can't do that, we
can only mark it with desired new time slot or -1 for remove.
Reviewed by: rrs
Differential revision: https://reviews.freebsd.org/D33026
2021-12-02 18:48:49 +00:00
|
|
|
|
2021-12-02 22:45:04 +00:00
|
|
|
INP_WLOCK_ASSERT(inp);
|
|
|
|
MPASS(!tcp_in_hpts(inp));
|
|
|
|
MPASS(!(inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We now return the next-slot the hpts will be on, beyond its
|
|
|
|
* current run (if up) or where it was when it stopped if it is
|
|
|
|
* sleeping.
|
|
|
|
*/
|
|
|
|
hpts = tcp_hpts_lock(inp);
|
|
|
|
microuptime(&tv);
|
2018-04-19 13:37:59 +00:00
|
|
|
if (diag) {
|
|
|
|
memset(diag, 0, sizeof(struct hpts_diag));
|
|
|
|
diag->p_hpts_active = hpts->p_hpts_active;
|
2019-07-10 20:40:39 +00:00
|
|
|
diag->p_prev_slot = hpts->p_prev_slot;
|
2021-07-06 19:23:22 +00:00
|
|
|
diag->p_runningslot = hpts->p_runningslot;
|
2018-04-19 13:37:59 +00:00
|
|
|
diag->p_nxt_slot = hpts->p_nxt_slot;
|
|
|
|
diag->p_cur_slot = hpts->p_cur_slot;
|
2019-07-10 20:40:39 +00:00
|
|
|
diag->p_curtick = hpts->p_curtick;
|
|
|
|
diag->p_lasttick = hpts->p_lasttick;
|
2018-04-19 13:37:59 +00:00
|
|
|
diag->slot_req = slot;
|
2019-07-10 20:40:39 +00:00
|
|
|
diag->p_on_min_sleep = hpts->p_on_min_sleep;
|
|
|
|
diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
|
2018-04-19 13:37:59 +00:00
|
|
|
}
|
2021-07-06 19:23:22 +00:00
|
|
|
if (slot == 0) {
|
2021-12-02 22:45:04 +00:00
|
|
|
/* Ok we need to set it on the hpts in the current slot */
|
|
|
|
inp->inp_hpts_request = 0;
|
|
|
|
if ((hpts->p_hpts_active == 0) || (hpts->p_wheel_complete)) {
|
|
|
|
/*
|
|
|
|
* A sleeping hpts we want in next slot to run
|
|
|
|
* note that in this state p_prev_slot == p_cur_slot
|
|
|
|
*/
|
|
|
|
inp->inp_hptsslot = hpts_slot(hpts->p_prev_slot, 1);
|
|
|
|
if ((hpts->p_on_min_sleep == 0) &&
|
|
|
|
(hpts->p_hpts_active == 0))
|
|
|
|
need_wakeup = true;
|
|
|
|
} else
|
|
|
|
inp->inp_hptsslot = hpts->p_runningslot;
|
|
|
|
if (__predict_true(inp->inp_in_hpts != IHPTS_MOVING))
|
|
|
|
inp_hpts_insert(inp, hpts);
|
|
|
|
if (need_wakeup) {
|
|
|
|
/*
|
|
|
|
* Activate the hpts if it is sleeping and its
|
|
|
|
* timeout is not 1.
|
|
|
|
*/
|
|
|
|
hpts->p_direct_wake = 1;
|
|
|
|
tcp_wakehpts(hpts);
|
|
|
|
}
|
|
|
|
slot_on = hpts->p_nxt_slot;
|
|
|
|
HPTS_UNLOCK(hpts);
|
|
|
|
|
|
|
|
return (slot_on);
|
2021-07-06 19:23:22 +00:00
|
|
|
}
|
|
|
|
/* Get the current time relative to the wheel */
|
2021-12-02 22:45:04 +00:00
|
|
|
wheel_cts = tcp_tv_to_hptstick(&tv);
|
2021-07-06 19:23:22 +00:00
|
|
|
/* Map it onto the wheel */
|
|
|
|
wheel_slot = tick_to_wheel(wheel_cts);
|
|
|
|
/* Now what's the max we can place it at? */
|
|
|
|
maxslots = max_slots_available(hpts, wheel_slot, &last_slot);
|
|
|
|
if (diag) {
|
|
|
|
diag->wheel_slot = wheel_slot;
|
|
|
|
diag->maxslots = maxslots;
|
|
|
|
diag->wheel_cts = wheel_cts;
|
|
|
|
}
|
|
|
|
if (maxslots == 0) {
|
|
|
|
/* The pacer is in a wheel wrap behind, yikes! */
|
|
|
|
if (slot > 1) {
|
|
|
|
/*
|
|
|
|
* Reduce by 1 to prevent a forever loop in
|
|
|
|
* case something else is wrong. Note this
|
|
|
|
* probably does not hurt because the pacer
|
|
|
|
* if its true is so far behind we will be
|
|
|
|
* > 1second late calling anyway.
|
|
|
|
*/
|
|
|
|
slot--;
|
2019-07-10 20:40:39 +00:00
|
|
|
}
|
2021-07-06 19:23:22 +00:00
|
|
|
inp->inp_hptsslot = last_slot;
|
|
|
|
inp->inp_hpts_request = slot;
|
|
|
|
} else if (maxslots >= slot) {
|
|
|
|
/* It all fits on the wheel */
|
|
|
|
inp->inp_hpts_request = 0;
|
|
|
|
inp->inp_hptsslot = hpts_slot(wheel_slot, slot);
|
|
|
|
} else {
|
|
|
|
/* It does not fit */
|
|
|
|
inp->inp_hpts_request = slot - maxslots;
|
|
|
|
inp->inp_hptsslot = last_slot;
|
|
|
|
}
|
|
|
|
if (diag) {
|
|
|
|
diag->slot_remaining = inp->inp_hpts_request;
|
|
|
|
diag->inp_hptsslot = inp->inp_hptsslot;
|
|
|
|
}
|
|
|
|
#ifdef INVARIANTS
|
|
|
|
check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line);
|
|
|
|
#endif
|
2021-12-02 22:45:04 +00:00
|
|
|
if (__predict_true(inp->inp_in_hpts != IHPTS_MOVING))
|
|
|
|
inp_hpts_insert(inp, hpts);
|
2021-07-06 19:23:22 +00:00
|
|
|
if ((hpts->p_hpts_active == 0) &&
|
|
|
|
(inp->inp_hpts_request == 0) &&
|
|
|
|
(hpts->p_on_min_sleep == 0)) {
|
|
|
|
/*
|
|
|
|
* The hpts is sleeping and NOT on a minimum
|
|
|
|
* sleep time, we need to figure out where
|
|
|
|
* it will wake up at and if we need to reschedule
|
|
|
|
* its time-out.
|
|
|
|
*/
|
|
|
|
uint32_t have_slept, yet_to_sleep;
|
|
|
|
|
|
|
|
/* Now do we need to restart the hpts's timer? */
|
|
|
|
have_slept = hpts_slots_diff(hpts->p_prev_slot, wheel_slot);
|
|
|
|
if (have_slept < hpts->p_hpts_sleep_time)
|
|
|
|
yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
|
|
|
|
else {
|
|
|
|
/* We are over-due */
|
|
|
|
yet_to_sleep = 0;
|
|
|
|
need_wakeup = 1;
|
2019-07-10 20:40:39 +00:00
|
|
|
}
|
|
|
|
if (diag) {
|
2021-07-06 19:23:22 +00:00
|
|
|
diag->have_slept = have_slept;
|
|
|
|
diag->yet_to_sleep = yet_to_sleep;
|
2019-07-10 20:40:39 +00:00
|
|
|
}
|
2021-07-06 19:23:22 +00:00
|
|
|
if (yet_to_sleep &&
|
|
|
|
(yet_to_sleep > slot)) {
|
2018-04-19 13:37:59 +00:00
|
|
|
/*
|
2021-07-06 19:23:22 +00:00
|
|
|
* We need to reschedule the hpts's time-out.
|
2018-04-19 13:37:59 +00:00
|
|
|
*/
|
2021-07-06 19:23:22 +00:00
|
|
|
hpts->p_hpts_sleep_time = slot;
|
|
|
|
need_new_to = slot * HPTS_TICKS_PER_SLOT;
|
2018-04-19 13:37:59 +00:00
|
|
|
}
|
2021-07-06 19:23:22 +00:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Now how far is the hpts sleeping to? if active is 1, its
|
|
|
|
* up and ticking we do nothing, otherwise we may need to
|
|
|
|
* reschedule its callout if need_new_to is set from above.
|
|
|
|
*/
|
|
|
|
if (need_wakeup) {
|
|
|
|
hpts->p_direct_wake = 1;
|
|
|
|
tcp_wakehpts(hpts);
|
|
|
|
if (diag) {
|
|
|
|
diag->need_new_to = 0;
|
|
|
|
diag->co_ret = 0xffff0000;
|
|
|
|
}
|
|
|
|
} else if (need_new_to) {
|
|
|
|
int32_t co_ret;
|
|
|
|
struct timeval tv;
|
|
|
|
sbintime_t sb;
|
2018-04-19 13:37:59 +00:00
|
|
|
|
2021-07-06 19:23:22 +00:00
|
|
|
tv.tv_sec = 0;
|
|
|
|
tv.tv_usec = 0;
|
|
|
|
while (need_new_to > HPTS_USEC_IN_SEC) {
|
|
|
|
tv.tv_sec++;
|
|
|
|
need_new_to -= HPTS_USEC_IN_SEC;
|
|
|
|
}
|
|
|
|
tv.tv_usec = need_new_to;
|
|
|
|
sb = tvtosbt(tv);
|
|
|
|
cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
|
|
|
|
co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
|
|
|
|
hpts_timeout_swi, hpts, cpu,
|
|
|
|
(C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
|
|
|
|
if (diag) {
|
|
|
|
diag->need_new_to = need_new_to;
|
|
|
|
diag->co_ret = co_ret;
|
2018-04-19 13:37:59 +00:00
|
|
|
}
|
|
|
|
}
|
2021-12-02 21:35:14 +00:00
|
|
|
slot_on = hpts->p_nxt_slot;
|
2021-12-02 22:45:04 +00:00
|
|
|
HPTS_UNLOCK(hpts);
|
2021-12-02 21:35:14 +00:00
|
|
|
|
2021-12-02 22:45:04 +00:00
|
|
|
return (slot_on);
|
2021-12-02 21:35:14 +00:00
|
|
|
}
|
|
|
|
|
2021-12-02 22:45:04 +00:00
|
|
|
void
|
|
|
|
tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason)
|
2018-04-19 13:37:59 +00:00
|
|
|
{
|
|
|
|
struct tcp_hpts_entry *hpts;
|
2021-12-02 22:45:04 +00:00
|
|
|
struct tcpcb *tp = intotcpcb(inp);
|
2018-04-19 13:37:59 +00:00
|
|
|
|
2021-12-02 22:45:04 +00:00
|
|
|
INP_WLOCK_ASSERT(inp);
|
|
|
|
inp->inp_hpts_drop_reas = reason;
|
|
|
|
if (inp->inp_in_dropq != IHPTS_NONE)
|
|
|
|
return;
|
|
|
|
hpts = tcp_dropq_lock(tp->t_inpcb);
|
|
|
|
MPASS(hpts->p_cpu == inp->inp_dropq_cpu);
|
2021-12-02 18:48:48 +00:00
|
|
|
|
2021-12-02 22:45:04 +00:00
|
|
|
TAILQ_INSERT_TAIL(&hpts->p_dropq, inp, inp_dropq);
|
|
|
|
inp->inp_in_dropq = IHPTS_ONQUEUE;
|
|
|
|
inp->inp_dropq_gencnt = hpts->p_dropq_gencnt;
|
|
|
|
hpts->p_dropq_cnt++;
|
|
|
|
in_pcbref(inp);
|
2021-12-02 18:48:48 +00:00
|
|
|
|
2021-12-02 22:45:04 +00:00
|
|
|
if ((hpts->p_hpts_active == 0) && (hpts->p_on_min_sleep == 0)){
|
2018-04-19 13:37:59 +00:00
|
|
|
hpts->p_direct_wake = 1;
|
2021-07-06 19:23:22 +00:00
|
|
|
tcp_wakehpts(hpts);
|
2018-04-19 13:37:59 +00:00
|
|
|
}
|
2021-12-02 22:45:04 +00:00
|
|
|
HPTS_UNLOCK(hpts);
|
2018-04-19 13:37:59 +00:00
|
|
|
}
|
|
|
|
|
tcp_hpts: rewrite inpcb synchronization
Just trust the pcb database, that if we did in_pcbref(), no way
an inpcb can go away. And if we never put a dropped inpcb on
our queue, and tcp_discardcb() always removes an inpcb to be
dropped from the queue, then any inpcb on the queue is valid.
Now, to solve LOR between inpcb lock and HPTS queue lock do the
following trick. When we are about to process a certain time
slot, take the full queue of the head list into on stack list,
drop the HPTS lock and work on our queue. This of course opens
a race when an inpcb is being removed from the on stack queue,
which was already mentioned in comments. To address this race
introduce generation count into queues. If we want to remove
an inpcb with generation count mismatch, we can't do that, we
can only mark it with desired new time slot or -1 for remove.
Reviewed by: rrs
Differential revision: https://reviews.freebsd.org/D33026
2021-12-02 18:48:49 +00:00
|
|
|
uint16_t
|
2018-04-19 13:37:59 +00:00
|
|
|
hpts_random_cpu(struct inpcb *inp){
|
|
|
|
/*
|
|
|
|
* No flow type set distribute the load randomly.
|
|
|
|
*/
|
|
|
|
uint16_t cpuid;
|
|
|
|
uint32_t ran;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If one has been set use it i.e. we want both in and out on the
|
|
|
|
* same hpts.
|
|
|
|
*/
|
2021-12-02 22:45:04 +00:00
|
|
|
if (inp->inp_dropq_cpu_set) {
|
|
|
|
return (inp->inp_dropq_cpu);
|
2018-04-19 13:37:59 +00:00
|
|
|
} else if (inp->inp_hpts_cpu_set) {
|
|
|
|
return (inp->inp_hpts_cpu);
|
|
|
|
}
|
|
|
|
/* Nothing set use a random number */
|
|
|
|
ran = arc4random();
|
2021-07-06 19:23:22 +00:00
|
|
|
cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss);
|
2018-04-19 13:37:59 +00:00
|
|
|
return (cpuid);
|
|
|
|
}
|
|
|
|
|
|
|
|
static uint16_t
|
2021-07-06 19:23:22 +00:00
|
|
|
hpts_cpuid(struct inpcb *inp, int *failed)
|
2020-03-03 14:15:30 +00:00
|
|
|
{
|
2018-09-06 16:11:24 +00:00
|
|
|
u_int cpuid;
|
2020-03-03 14:15:30 +00:00
|
|
|
#if !defined(RSS) && defined(NUMA)
|
2019-05-10 13:41:19 +00:00
|
|
|
struct hpts_domain_info *di;
|
|
|
|
#endif
|
2018-04-19 13:37:59 +00:00
|
|
|
|
2021-07-06 19:23:22 +00:00
|
|
|
*failed = 0;
|
2018-04-19 13:37:59 +00:00
|
|
|
/*
|
|
|
|
* If one has been set use it i.e. we want both in and out on the
|
|
|
|
* same hpts.
|
|
|
|
*/
|
2021-12-02 22:45:04 +00:00
|
|
|
if (inp->inp_dropq_cpu_set) {
|
|
|
|
return (inp->inp_dropq_cpu);
|
2018-04-19 13:37:59 +00:00
|
|
|
} else if (inp->inp_hpts_cpu_set) {
|
|
|
|
return (inp->inp_hpts_cpu);
|
|
|
|
}
|
2021-07-06 19:23:22 +00:00
|
|
|
/*
|
|
|
|
* If we are using the irq cpu set by LRO or
|
|
|
|
* the driver then it overrides all other domains.
|
|
|
|
*/
|
|
|
|
if (tcp_use_irq_cpu) {
|
|
|
|
if (inp->inp_irq_cpu_set == 0) {
|
|
|
|
*failed = 1;
|
|
|
|
return(0);
|
|
|
|
}
|
|
|
|
return(inp->inp_irq_cpu);
|
|
|
|
}
|
2018-04-19 13:37:59 +00:00
|
|
|
/* If one is set the other must be the same */
|
2020-03-03 14:15:30 +00:00
|
|
|
#ifdef RSS
|
2018-04-19 13:37:59 +00:00
|
|
|
cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
|
|
|
|
if (cpuid == NETISR_CPUID_NONE)
|
|
|
|
return (hpts_random_cpu(inp));
|
|
|
|
else
|
|
|
|
return (cpuid);
|
|
|
|
#else
|
|
|
|
/*
|
|
|
|
* We don't have a flowid -> cpuid mapping, so cheat and just map
|
|
|
|
* unknown cpuids to curcpu. Not the best, but apparently better
|
|
|
|
* than defaulting to swi 0.
|
|
|
|
*/
|
2021-07-06 19:23:22 +00:00
|
|
|
if (inp->inp_flowtype == M_HASHTYPE_NONE) {
|
|
|
|
counter_u64_add(cpu_uses_random, 1);
|
2019-05-10 13:41:19 +00:00
|
|
|
return (hpts_random_cpu(inp));
|
2021-07-06 19:23:22 +00:00
|
|
|
}
|
2019-05-10 13:41:19 +00:00
|
|
|
/*
|
|
|
|
* Hash to a thread based on the flowid. If we are using numa,
|
|
|
|
* then restrict the hash to the numa domain where the inp lives.
|
|
|
|
*/
|
|
|
|
#ifdef NUMA
|
|
|
|
if (tcp_bind_threads == 2 && inp->inp_numa_domain != M_NODOM) {
|
|
|
|
di = &hpts_domains[inp->inp_numa_domain];
|
|
|
|
cpuid = di->cpu[inp->inp_flowid % di->count];
|
|
|
|
} else
|
|
|
|
#endif
|
2018-04-19 13:37:59 +00:00
|
|
|
cpuid = inp->inp_flowid % mp_ncpus;
|
2021-07-06 19:23:22 +00:00
|
|
|
counter_u64_add(cpu_uses_flowid, 1);
|
2018-04-19 13:37:59 +00:00
|
|
|
return (cpuid);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2019-07-10 20:40:39 +00:00
|
|
|
static void
|
|
|
|
tcp_drop_in_pkts(struct tcpcb *tp)
|
|
|
|
{
|
|
|
|
struct mbuf *m, *n;
|
2020-02-12 13:07:09 +00:00
|
|
|
|
2019-07-10 20:40:39 +00:00
|
|
|
m = tp->t_in_pkt;
|
|
|
|
if (m)
|
|
|
|
n = m->m_nextpkt;
|
|
|
|
else
|
|
|
|
n = NULL;
|
|
|
|
tp->t_in_pkt = NULL;
|
|
|
|
while (m) {
|
|
|
|
m_freem(m);
|
|
|
|
m = n;
|
|
|
|
if (m)
|
|
|
|
n = m->m_nextpkt;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-04-19 13:37:59 +00:00
|
|
|
/*
|
2021-12-02 22:45:04 +00:00
|
|
|
* Delayed drop functionality is factored out into separate function,
|
|
|
|
* but logic is similar to the logic of tcp_hptsi().
|
2018-04-19 13:37:59 +00:00
|
|
|
*/
|
|
|
|
static void
|
2021-12-02 22:45:04 +00:00
|
|
|
tcp_delayed_drop(struct tcp_hpts_entry *hpts)
|
2018-04-19 13:37:59 +00:00
|
|
|
{
|
2021-12-02 22:45:04 +00:00
|
|
|
TAILQ_HEAD(, inpcb) head = TAILQ_HEAD_INITIALIZER(head);
|
|
|
|
struct inpcb *inp, *tmp;
|
2018-04-19 13:37:59 +00:00
|
|
|
struct tcpcb *tp;
|
|
|
|
|
|
|
|
HPTS_MTX_ASSERT(hpts);
|
2019-11-07 21:30:27 +00:00
|
|
|
NET_EPOCH_ASSERT();
|
|
|
|
|
2021-12-02 22:45:04 +00:00
|
|
|
TAILQ_SWAP(&head, &hpts->p_dropq, inpcb, inp_dropq);
|
|
|
|
hpts->p_dropq_cnt = 0;
|
|
|
|
hpts->p_dropq_gencnt++;
|
|
|
|
HPTS_UNLOCK(hpts);
|
|
|
|
|
|
|
|
TAILQ_FOREACH_SAFE(inp, &head, inp_dropq, tmp) {
|
2018-04-19 13:37:59 +00:00
|
|
|
INP_WLOCK(inp);
|
2021-12-02 22:45:04 +00:00
|
|
|
MPASS(inp->inp_hpts_drop_reas != 0);
|
|
|
|
if (__predict_false(inp->inp_in_dropq == IHPTS_MOVING)) {
|
|
|
|
inp->inp_in_dropq = IHPTS_NONE;
|
|
|
|
if (in_pcbrele_wlocked(inp) == false)
|
2018-04-19 13:37:59 +00:00
|
|
|
INP_WUNLOCK(inp);
|
|
|
|
continue;
|
|
|
|
}
|
2021-12-02 22:45:04 +00:00
|
|
|
MPASS(inp->inp_in_dropq == IHPTS_ONQUEUE);
|
|
|
|
inp->inp_in_dropq = IHPTS_NONE;
|
|
|
|
if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED))) {
|
|
|
|
if (in_pcbrele_wlocked(inp) == false)
|
2018-04-19 13:37:59 +00:00
|
|
|
INP_WUNLOCK(inp);
|
|
|
|
continue;
|
|
|
|
}
|
2021-12-02 22:45:04 +00:00
|
|
|
CURVNET_SET(inp->inp_vnet);
|
|
|
|
if (__predict_true((tp = intotcpcb(inp)) != NULL)) {
|
|
|
|
MPASS(tp->t_inpcb == inp);
|
2021-12-02 21:35:14 +00:00
|
|
|
tcp_drop_in_pkts(tp);
|
2021-12-02 22:45:04 +00:00
|
|
|
tp = tcp_drop(tp, inp->inp_hpts_drop_reas);
|
|
|
|
if (tp == NULL)
|
|
|
|
INP_WLOCK(inp);
|
2019-07-10 20:40:39 +00:00
|
|
|
}
|
2021-12-02 22:45:04 +00:00
|
|
|
if (in_pcbrele_wlocked(inp) == false)
|
2018-04-19 13:37:59 +00:00
|
|
|
INP_WUNLOCK(inp);
|
2019-07-10 20:40:39 +00:00
|
|
|
CURVNET_RESTORE();
|
2018-04-19 13:37:59 +00:00
|
|
|
}
|
2021-12-02 22:45:04 +00:00
|
|
|
|
|
|
|
mtx_lock(&hpts->p_mtx); /* XXXGL */
|
2018-04-19 13:37:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2021-07-06 19:23:22 +00:00
|
|
|
tcp_hpts_set_max_sleep(struct tcp_hpts_entry *hpts, int wrap_loop_cnt)
|
|
|
|
{
|
|
|
|
uint32_t t = 0, i, fnd = 0;
|
|
|
|
|
|
|
|
if ((hpts->p_on_queue_cnt) && (wrap_loop_cnt < 2)) {
|
|
|
|
/*
|
|
|
|
* Find next slot that is occupied and use that to
|
|
|
|
* be the sleep time.
|
|
|
|
*/
|
|
|
|
for (i = 0, t = hpts_slot(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) {
|
2021-12-02 22:45:04 +00:00
|
|
|
if (TAILQ_EMPTY(&hpts->p_hptss[t].head) == 0) {
|
2021-07-06 19:23:22 +00:00
|
|
|
fnd = 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
t = (t + 1) % NUM_OF_HPTSI_SLOTS;
|
|
|
|
}
|
|
|
|
KASSERT(fnd != 0, ("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt));
|
|
|
|
hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max);
|
|
|
|
} else {
|
|
|
|
/* No one on the wheel sleep for all but 400 slots or sleep max */
|
|
|
|
hpts->p_hpts_sleep_time = hpts_sleep_max;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Run one pass of the pacer wheel.  Walks every slot between the last
 * processed slot (p_prev_slot) and the slot corresponding to "now"
 * (p_cur_slot); each inpcb queued in those slots is released from the
 * wheel and tcp_output() is called on its tcpcb, unless its deferral
 * request (inp_hpts_request) extends beyond this pass, in which case it
 * is pushed back onto the wheel.
 *
 * Entered with the hpts mutex held and inside the net epoch; the mutex
 * is dropped while each slot's connections are processed and re-taken
 * before returning.  from_callout is non-zero when invoked from the
 * hpts callout thread (which may loop via "again" to catch up) and zero
 * for the opportunistic syscall/LRO path (single pass only).
 *
 * Returns the wheel-slot distance between the first endpoint processed
 * and where the pass ended (0 if none was seen); callers use this to
 * tune their sleep time.
 */
static int32_t
tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout)
{
	struct tcpcb *tp;
	struct inpcb *inp;
	struct timeval tv;
	uint64_t total_slots_processed = 0;
	int32_t slots_to_run, i, error;
	int32_t paced_cnt = 0;
	int32_t loop_cnt = 0;
	int32_t did_prefetch = 0;
	int32_t prefetch_ninp = 0;
	int32_t prefetch_tp = 0;
	int32_t wrap_loop_cnt = 0;
	int32_t slot_pos_of_endpoint = 0;
	int32_t orig_exit_slot;
	int8_t completed_measure = 0, seen_endpoint = 0;

	HPTS_MTX_ASSERT(hpts);
	NET_EPOCH_ASSERT();
	/* record previous info for any logging */
	hpts->saved_lasttick = hpts->p_lasttick;
	hpts->saved_curtick = hpts->p_curtick;
	hpts->saved_curslot = hpts->p_cur_slot;
	hpts->saved_prev_slot = hpts->p_prev_slot;

	/* Advance our notion of "now" and record when we last ran. */
	hpts->p_lasttick = hpts->p_curtick;
	hpts->p_curtick = tcp_gethptstick(&tv);
	cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv);
	orig_exit_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
	if ((hpts->p_on_queue_cnt == 0) ||
	    (hpts->p_lasttick == hpts->p_curtick)) {
		/*
		 * No time has yet passed,
		 * or nothing to do.
		 */
		hpts->p_prev_slot = hpts->p_cur_slot;
		hpts->p_lasttick = hpts->p_curtick;
		goto no_run;
	}
again:
	hpts->p_wheel_complete = 0;
	HPTS_MTX_ASSERT(hpts);
	slots_to_run = hpts_slots_diff(hpts->p_prev_slot, hpts->p_cur_slot);
	if (((hpts->p_curtick - hpts->p_lasttick) >
	     ((NUM_OF_HPTSI_SLOTS-1) * HPTS_TICKS_PER_SLOT)) &&
	    (hpts->p_on_queue_cnt != 0)) {
		/*
		 * Wheel wrap is occurring: we are so far behind that the
		 * gap between runs exceeds the whole wheel (1.024 s).
		 * This is ugly and should NOT be happening.  We must run
		 * the entire wheel.  We last processed p_prev_slot, so
		 * that is the last slot we run; the slot after it becomes
		 * the reserved "new inserts" slot and the one after that
		 * the running position.  The reserved slot may itself hold
		 * inp's that need running, so merge them into the first
		 * running slot below.
		 */
		wrap_loop_cnt++;
		hpts->p_nxt_slot = hpts_slot(hpts->p_prev_slot, 1);
		hpts->p_runningslot = hpts_slot(hpts->p_prev_slot, 2);
		/*
		 * Adjust p_cur_slot to be where we are starting from
		 * hopefully we will catch up (fat chance if something
		 * is broken this bad :( )
		 */
		hpts->p_cur_slot = hpts->p_prev_slot;
		/*
		 * The next slot has guys to run too, and that would
		 * be where we would normally start, lets move them into
		 * the next slot (p_prev_slot + 2) so that we will
		 * run them, the extra 10usecs of late (by being
		 * put behind) does not really matter in this situation.
		 */
		TAILQ_FOREACH(inp, &hpts->p_hptss[hpts->p_nxt_slot].head,
		    inp_hpts) {
			MPASS(inp->inp_hptsslot == hpts->p_nxt_slot);
			MPASS(inp->inp_hpts_gencnt ==
			    hpts->p_hptss[hpts->p_nxt_slot].gencnt);
			MPASS(inp->inp_in_hpts == IHPTS_ONQUEUE);

			/*
			 * Update gencnt and nextslot accordingly to match
			 * the new location. This is safe since it takes both
			 * the INP lock and the pacer mutex to change the
			 * inp_hptsslot and inp_hpts_gencnt.
			 */
			inp->inp_hpts_gencnt =
			    hpts->p_hptss[hpts->p_runningslot].gencnt;
			inp->inp_hptsslot = hpts->p_runningslot;
		}
		TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningslot].head,
		    &hpts->p_hptss[hpts->p_nxt_slot].head, inp_hpts);
		hpts->p_hptss[hpts->p_runningslot].count +=
		    hpts->p_hptss[hpts->p_nxt_slot].count;
		hpts->p_hptss[hpts->p_nxt_slot].count = 0;
		hpts->p_hptss[hpts->p_nxt_slot].gencnt++;
		slots_to_run = NUM_OF_HPTSI_SLOTS - 1;
		counter_u64_add(wheel_wrap, 1);
	} else {
		/*
		 * Nxt slot is always one after p_runningslot though
		 * its not used usually unless we are doing wheel wrap.
		 */
		hpts->p_nxt_slot = hpts->p_prev_slot;
		hpts->p_runningslot = hpts_slot(hpts->p_prev_slot, 1);
	}
	/* The drop queue and its count must agree on emptiness. */
	KASSERT((((TAILQ_EMPTY(&hpts->p_dropq) != 0) && (hpts->p_dropq_cnt == 0)) ||
	    ((TAILQ_EMPTY(&hpts->p_dropq) == 0) && (hpts->p_dropq_cnt > 0))),
	    ("%s hpts:%p in_hpts cnt:%d and queue state mismatch",
	    __FUNCTION__, hpts, hpts->p_dropq_cnt));
	if (hpts->p_on_queue_cnt == 0) {
		goto no_one;
	}
	for (i = 0; i < slots_to_run; i++) {
		/*
		 * NOTE(review): this "inp" shadows the function-scope
		 * "inp" used by the wheel-wrap path above.
		 */
		struct inpcb *inp, *ninp;
		TAILQ_HEAD(, inpcb) head = TAILQ_HEAD_INITIALIZER(head);
		struct hptsh *hptsh;
		uint32_t runningslot, gencnt;

		/*
		 * Calculate our delay, if there are no extra ticks there
		 * was not any (i.e. if slots_to_run == 1, no delay).
		 */
		hpts->p_delayed_by = (slots_to_run - (i + 1)) *
		    HPTS_TICKS_PER_SLOT;

		/*
		 * Detach this slot's whole list under the pacer lock so
		 * it can be walked with the lock dropped.  Bumping the
		 * slot gencnt invalidates stale references to it.
		 * NOTE(review): the saved pre-increment "gencnt" is not
		 * used below in this function.
		 */
		runningslot = hpts->p_runningslot;
		hptsh = &hpts->p_hptss[runningslot];
		TAILQ_SWAP(&head, &hptsh->head, inpcb, inp_hpts);
		hpts->p_on_queue_cnt -= hptsh->count;
		hptsh->count = 0;
		gencnt = hptsh->gencnt++;

		HPTS_UNLOCK(hpts);

		TAILQ_FOREACH_SAFE(inp, &head, inp_hpts, ninp) {
			bool set_cpu;

			if (ninp != NULL) {
				/* We prefetch the next inp if possible */
				kern_prefetch(ninp, &prefetch_ninp);
				prefetch_ninp = 1;
			}

			/* For debugging */
			if (seen_endpoint == 0) {
				seen_endpoint = 1;
				orig_exit_slot = slot_pos_of_endpoint =
				    runningslot;
			} else if (completed_measure == 0) {
				/* Record the new position */
				orig_exit_slot = runningslot;
			}
			total_slots_processed++;
			paced_cnt++;

			INP_WLOCK(inp);
			if (inp->inp_hpts_cpu_set == 0) {
				set_cpu = true;
			} else {
				set_cpu = false;
			}

			if (__predict_false(inp->inp_in_hpts == IHPTS_MOVING)) {
				/*
				 * Someone moved this inp while we had the
				 * lock dropped: slot -1 means it was being
				 * removed, otherwise re-insert at its new
				 * slot.
				 */
				if (inp->inp_hptsslot == -1) {
					inp->inp_in_hpts = IHPTS_NONE;
					if (in_pcbrele_wlocked(inp) == false)
						INP_WUNLOCK(inp);
				} else {
					HPTS_LOCK(hpts);
					inp_hpts_insert(inp, hpts);
					HPTS_UNLOCK(hpts);
					INP_WUNLOCK(inp);
				}
				continue;
			}

			MPASS(inp->inp_in_hpts == IHPTS_ONQUEUE);
			MPASS(!(inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)));
			KASSERT(runningslot == inp->inp_hptsslot,
				("Hpts:%p inp:%p slot mis-aligned %u vs %u",
				 hpts, inp, runningslot, inp->inp_hptsslot));

			if (inp->inp_hpts_request) {
				/*
				 * This guy is deferred out further in time
				 * then our wheel had available on it.
				 * Push him back on the wheel or run it
				 * depending.
				 */
				uint32_t maxslots, last_slot, remaining_slots;

				remaining_slots = slots_to_run - (i + 1);
				if (inp->inp_hpts_request > remaining_slots) {
					HPTS_LOCK(hpts);
					/*
					 * How far out can we go?
					 */
					maxslots = max_slots_available(hpts,
					    hpts->p_cur_slot, &last_slot);
					if (maxslots >= inp->inp_hpts_request) {
						/* We can place it finally to
						 * be processed. */
						inp->inp_hptsslot = hpts_slot(
						    hpts->p_runningslot,
						    inp->inp_hpts_request);
						inp->inp_hpts_request = 0;
					} else {
						/* Work off some more time */
						inp->inp_hptsslot = last_slot;
						inp->inp_hpts_request -=
						    maxslots;
					}
					inp_hpts_insert(inp, hpts);
					HPTS_UNLOCK(hpts);
					INP_WUNLOCK(inp);
					continue;
				}
				inp->inp_hpts_request = 0;
				/* Fall through we will so do it now */
			}

			/* Off the wheel; time to pace this connection. */
			inp_hpts_release(inp);
			tp = intotcpcb(inp);
			MPASS(tp);
			if (set_cpu) {
				/*
				 * Setup so the next time we will move to
				 * the right CPU. This should be a rare
				 * event. It will sometimes happens when we
				 * are the client side (usually not the
				 * server). Somehow tcp_output() gets called
				 * before the tcp_do_segment() sets the
				 * intial state. This means the r_cpu and
				 * r_hpts_cpu is 0. We get on the hpts, and
				 * then tcp_input() gets called setting up
				 * the r_cpu to the correct value. The hpts
				 * goes off and sees the mis-match. We
				 * simply correct it here and the CPU will
				 * switch to the new hpts nextime the tcb
				 * gets added to the the hpts (not this one)
				 * :-)
				 */
				tcp_set_hpts(inp);
			}
			CURVNET_SET(inp->inp_vnet);
			/* Lets do any logging that we might want to */
			if (hpts_does_tp_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
				tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, from_callout);
			}

			if (tp->t_fb_ptr != NULL) {
				kern_prefetch(tp->t_fb_ptr, &did_prefetch);
				did_prefetch = 1;
			}
			/* Drain any deferred input first if the stack queues mbufs. */
			if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) {
				error = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0);
				if (error) {
					/* The input killed the connection */
					goto skip_pacing;
				}
			}
			inp->inp_hpts_calls = 1;
			error = tcp_output(tp);
			/* Negative return: the inp lock was dropped/released. */
			if (error < 0)
				goto skip_pacing;
			inp->inp_hpts_calls = 0;
			if (ninp && ninp->inp_ppcb) {
				/*
				 * If we have a nxt inp, try to prefetch its
				 * ppcb.  We hold no lock on ninp, so it may
				 * have been freed and its slab memory reused;
				 * in that case inp_ppcb may not be a valid
				 * address.  That is acceptable: kern_prefetch
				 * on amd64 guards bad addresses via the
				 * DMAP_() tests, so the worst case is loading
				 * the cache with a useless address rather
				 * than a fault.
				 */
				kern_prefetch(ninp->inp_ppcb, &prefetch_tp);
				prefetch_tp = 1;
			}
			INP_WUNLOCK(inp);
		skip_pacing:
			CURVNET_RESTORE();
		}
		if (seen_endpoint) {
			/*
			 * We now have a accurate distance between
			 * slot_pos_of_endpoint <-> orig_exit_slot
			 * to tell us how late we were, orig_exit_slot
			 * is where we calculated the end of our cycle to
			 * be when we first entered.
			 */
			completed_measure = 1;
		}
		HPTS_LOCK(hpts);
		hpts->p_runningslot++;
		if (hpts->p_runningslot >= NUM_OF_HPTSI_SLOTS) {
			hpts->p_runningslot = 0;
		}
	}
no_one:
	HPTS_MTX_ASSERT(hpts);
	hpts->p_delayed_by = 0;
	/*
	 * Check to see if we took an excess amount of time and need to run
	 * more ticks (if we did not hit eno-bufs).
	 */
	KASSERT((((TAILQ_EMPTY(&hpts->p_dropq) != 0) && (hpts->p_dropq_cnt == 0)) ||
	    ((TAILQ_EMPTY(&hpts->p_dropq) == 0) && (hpts->p_dropq_cnt > 0))),
	    ("%s hpts:%p in_hpts cnt:%d queue state mismatch",
	    __FUNCTION__, hpts, hpts->p_dropq_cnt));
	hpts->p_prev_slot = hpts->p_cur_slot;
	hpts->p_lasttick = hpts->p_curtick;
	if ((from_callout == 0) || (loop_cnt > max_pacer_loops)) {
		/*
		 * Something is seriously slow: we have looped through
		 * processing the wheel max_pacer_loops times and still
		 * needed to run.  The system is hopelessly behind and
		 * can never catch up :(
		 *
		 * We will just lie to this thread and let it think
		 * p_curtick is correct.  When it next awakens it will
		 * find itself further behind.
		 */
		if (from_callout)
			counter_u64_add(hpts_hopelessly_behind, 1);
		goto no_run;
	}
	hpts->p_curtick = tcp_gethptstick(&tv);
	hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
	if (seen_endpoint == 0) {
		/* We saw no endpoint but we may be looping */
		orig_exit_slot = hpts->p_cur_slot;
	}
	if ((wrap_loop_cnt < 2) &&
	    (hpts->p_lasttick != hpts->p_curtick)) {
		/* Time moved on while we ran -- do another pass. */
		counter_u64_add(hpts_loops, 1);
		loop_cnt++;
		goto again;
	}
no_run:
	cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv);
	/*
	 * Set flag to tell that we are done for
	 * any slot input that happens during
	 * input.
	 */
	hpts->p_wheel_complete = 1;
	/*
	 * Run any input that may be there not covered
	 * in running data.
	 */
	tcp_delayed_drop(hpts);
	/*
	 * Now did we spend too long running input and need to run more ticks?
	 * Note that if wrap_loop_cnt < 2 then we should have the conditions
	 * in the KASSERT's true. But if the wheel is behind i.e. wrap_loop_cnt
	 * is greater than 2, then the condtion most likely are *not* true.
	 * Also if we are called not from the callout, we don't run the wheel
	 * multiple times so the slots may not align either.
	 */
	KASSERT(((hpts->p_prev_slot == hpts->p_cur_slot) ||
	    (wrap_loop_cnt >= 2) || (from_callout == 0)),
	    ("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts,
	    hpts->p_prev_slot, hpts->p_cur_slot));
	KASSERT(((hpts->p_lasttick == hpts->p_curtick)
	    || (wrap_loop_cnt >= 2) || (from_callout == 0)),
	    ("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts,
	    hpts->p_lasttick, hpts->p_curtick));
	if (from_callout && (hpts->p_lasttick != hpts->p_curtick)) {
		/* tcp_delayed_drop() took long enough that time moved on. */
		hpts->p_curtick = tcp_gethptstick(&tv);
		counter_u64_add(hpts_loops, 1);
		hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
		goto again;
	}

	if (from_callout){
		tcp_hpts_set_max_sleep(hpts, wrap_loop_cnt);
	}
	if (seen_endpoint)
		return(hpts_slots_diff(slot_pos_of_endpoint, orig_exit_slot));
	else
		return (0);
}
|
|
|
|
|
|
|
|
void
|
|
|
|
__tcp_set_hpts(struct inpcb *inp, int32_t line)
|
|
|
|
{
|
|
|
|
struct tcp_hpts_entry *hpts;
|
2021-07-06 19:23:22 +00:00
|
|
|
int failed;
|
2018-04-19 13:37:59 +00:00
|
|
|
|
|
|
|
INP_WLOCK_ASSERT(inp);
|
|
|
|
hpts = tcp_hpts_lock(inp);
|
|
|
|
if ((inp->inp_in_hpts == 0) &&
|
|
|
|
(inp->inp_hpts_cpu_set == 0)) {
|
2021-07-06 19:23:22 +00:00
|
|
|
inp->inp_hpts_cpu = hpts_cpuid(inp, &failed);
|
|
|
|
if (failed == 0)
|
|
|
|
inp->inp_hpts_cpu_set = 1;
|
2018-04-19 13:37:59 +00:00
|
|
|
}
|
|
|
|
mtx_unlock(&hpts->p_mtx);
|
2021-12-02 22:45:04 +00:00
|
|
|
hpts = tcp_dropq_lock(inp);
|
|
|
|
if ((inp->inp_dropq_cpu_set == 0) &&
|
|
|
|
(inp->inp_in_dropq == 0)) {
|
|
|
|
inp->inp_dropq_cpu = hpts_cpuid(inp, &failed);
|
2021-07-06 19:23:22 +00:00
|
|
|
if (failed == 0)
|
2021-12-02 22:45:04 +00:00
|
|
|
inp->inp_dropq_cpu_set = 1;
|
2018-04-19 13:37:59 +00:00
|
|
|
}
|
|
|
|
mtx_unlock(&hpts->p_mtx);
|
|
|
|
}
|
|
|
|
|
2021-07-06 19:23:22 +00:00
|
|
|
static void
|
|
|
|
__tcp_run_hpts(struct tcp_hpts_entry *hpts)
|
|
|
|
{
|
|
|
|
int ticks_ran;
|
|
|
|
|
|
|
|
if (hpts->p_hpts_active) {
|
|
|
|
/* Already active */
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (mtx_trylock(&hpts->p_mtx) == 0) {
|
|
|
|
/* Someone else got the lock */
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (hpts->p_hpts_active)
|
|
|
|
goto out_with_mtx;
|
|
|
|
hpts->syscall_cnt++;
|
|
|
|
counter_u64_add(hpts_direct_call, 1);
|
|
|
|
hpts->p_hpts_active = 1;
|
|
|
|
ticks_ran = tcp_hptsi(hpts, 0);
|
|
|
|
/* We may want to adjust the sleep values here */
|
|
|
|
if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
|
|
|
|
if (ticks_ran > ticks_indicate_less_sleep) {
|
|
|
|
struct timeval tv;
|
|
|
|
sbintime_t sb;
|
|
|
|
int cpu;
|
|
|
|
|
|
|
|
hpts->p_mysleep.tv_usec /= 2;
|
|
|
|
if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
|
|
|
|
hpts->p_mysleep.tv_usec = dynamic_min_sleep;
|
|
|
|
/* Reschedule with new to value */
|
|
|
|
tcp_hpts_set_max_sleep(hpts, 0);
|
|
|
|
tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
|
|
|
|
/* Validate its in the right ranges */
|
|
|
|
if (tv.tv_usec < hpts->p_mysleep.tv_usec) {
|
|
|
|
hpts->overidden_sleep = tv.tv_usec;
|
|
|
|
tv.tv_usec = hpts->p_mysleep.tv_usec;
|
|
|
|
} else if (tv.tv_usec > dynamic_max_sleep) {
|
|
|
|
/* Lets not let sleep get above this value */
|
|
|
|
hpts->overidden_sleep = tv.tv_usec;
|
|
|
|
tv.tv_usec = dynamic_max_sleep;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* In this mode the timer is a backstop to
|
|
|
|
* all the userret/lro_flushes so we use
|
|
|
|
* the dynamic value and set the on_min_sleep
|
|
|
|
* flag so we will not be awoken.
|
|
|
|
*/
|
|
|
|
sb = tvtosbt(tv);
|
|
|
|
cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
|
|
|
|
/* Store off to make visible the actual sleep time */
|
|
|
|
hpts->sleeping = tv.tv_usec;
|
|
|
|
callout_reset_sbt_on(&hpts->co, sb, 0,
|
|
|
|
hpts_timeout_swi, hpts, cpu,
|
|
|
|
(C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
|
|
|
|
} else if (ticks_ran < ticks_indicate_more_sleep) {
|
|
|
|
/* For the further sleep, don't reschedule hpts */
|
|
|
|
hpts->p_mysleep.tv_usec *= 2;
|
|
|
|
if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
|
|
|
|
hpts->p_mysleep.tv_usec = dynamic_max_sleep;
|
|
|
|
}
|
|
|
|
hpts->p_on_min_sleep = 1;
|
|
|
|
}
|
|
|
|
hpts->p_hpts_active = 0;
|
|
|
|
out_with_mtx:
|
|
|
|
HPTS_MTX_ASSERT(hpts);
|
|
|
|
mtx_unlock(&hpts->p_mtx);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct tcp_hpts_entry *
|
|
|
|
tcp_choose_hpts_to_run()
|
|
|
|
{
|
|
|
|
int i, oldest_idx;
|
|
|
|
uint32_t cts, time_since_ran, calc;
|
|
|
|
|
|
|
|
if ((hpts_uses_oldest == 0) ||
|
|
|
|
((hpts_uses_oldest > 1) &&
|
|
|
|
(tcp_pace.rp_ent[(tcp_pace.rp_num_hptss-1)]->p_on_queue_cnt >= hpts_uses_oldest))) {
|
|
|
|
/*
|
|
|
|
* We have either disabled the feature (0), or
|
|
|
|
* we have crossed over the oldest threshold on the
|
|
|
|
* last hpts. We use the last one for simplification
|
|
|
|
* since we don't want to use the first one (it may
|
|
|
|
* have starting connections that have not settled
|
|
|
|
* on the cpu yet).
|
|
|
|
*/
|
|
|
|
return(tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]);
|
|
|
|
}
|
|
|
|
/* Lets find the oldest hpts to attempt to run */
|
|
|
|
cts = tcp_get_usecs(NULL);
|
|
|
|
time_since_ran = 0;
|
|
|
|
oldest_idx = -1;
|
|
|
|
for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
|
|
|
|
if (TSTMP_GT(cts, cts_last_ran[i]))
|
|
|
|
calc = cts - cts_last_ran[i];
|
|
|
|
else
|
|
|
|
calc = 0;
|
|
|
|
if (calc > time_since_ran) {
|
|
|
|
oldest_idx = i;
|
|
|
|
time_since_ran = calc;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (oldest_idx >= 0)
|
|
|
|
return(tcp_pace.rp_ent[oldest_idx]);
|
|
|
|
else
|
|
|
|
return(tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
tcp_run_hpts(void)
|
|
|
|
{
|
|
|
|
static struct tcp_hpts_entry *hpts;
|
|
|
|
struct epoch_tracker et;
|
|
|
|
|
|
|
|
NET_EPOCH_ENTER(et);
|
|
|
|
hpts = tcp_choose_hpts_to_run();
|
|
|
|
__tcp_run_hpts(hpts);
|
|
|
|
NET_EPOCH_EXIT(et);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2018-04-19 13:37:59 +00:00
|
|
|
static void
|
|
|
|
tcp_hpts_thread(void *ctx)
|
|
|
|
{
|
|
|
|
struct tcp_hpts_entry *hpts;
|
2019-11-07 21:30:27 +00:00
|
|
|
struct epoch_tracker et;
|
2018-04-19 13:37:59 +00:00
|
|
|
struct timeval tv;
|
|
|
|
sbintime_t sb;
|
2021-07-06 19:23:22 +00:00
|
|
|
int cpu, ticks_ran;
|
2018-04-19 13:37:59 +00:00
|
|
|
|
|
|
|
hpts = (struct tcp_hpts_entry *)ctx;
|
|
|
|
mtx_lock(&hpts->p_mtx);
|
|
|
|
if (hpts->p_direct_wake) {
|
2021-07-06 19:23:22 +00:00
|
|
|
/* Signaled by input or output with low occupancy count. */
|
2018-04-19 13:37:59 +00:00
|
|
|
callout_stop(&hpts->co);
|
2021-07-06 19:23:22 +00:00
|
|
|
counter_u64_add(hpts_direct_awakening, 1);
|
2018-04-19 13:37:59 +00:00
|
|
|
} else {
|
2021-07-06 19:23:22 +00:00
|
|
|
/* Timed out, the normal case. */
|
|
|
|
counter_u64_add(hpts_wake_timeout, 1);
|
2018-04-19 13:37:59 +00:00
|
|
|
if (callout_pending(&hpts->co) ||
|
|
|
|
!callout_active(&hpts->co)) {
|
|
|
|
mtx_unlock(&hpts->p_mtx);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
2021-07-06 19:23:22 +00:00
|
|
|
callout_deactivate(&hpts->co);
|
2019-07-10 20:40:39 +00:00
|
|
|
hpts->p_hpts_wake_scheduled = 0;
|
2019-11-07 21:30:27 +00:00
|
|
|
NET_EPOCH_ENTER(et);
|
2021-07-06 19:23:22 +00:00
|
|
|
if (hpts->p_hpts_active) {
|
|
|
|
/*
|
|
|
|
* We are active already. This means that a syscall
|
|
|
|
* trap or LRO is running in behalf of hpts. In that case
|
|
|
|
* we need to double our timeout since there seems to be
|
|
|
|
* enough activity in the system that we don't need to
|
|
|
|
* run as often (if we were not directly woken).
|
|
|
|
*/
|
|
|
|
if (hpts->p_direct_wake == 0) {
|
|
|
|
counter_u64_add(hpts_back_tosleep, 1);
|
|
|
|
if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
|
|
|
|
hpts->p_mysleep.tv_usec *= 2;
|
|
|
|
if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
|
|
|
|
hpts->p_mysleep.tv_usec = dynamic_max_sleep;
|
|
|
|
tv.tv_usec = hpts->p_mysleep.tv_usec;
|
|
|
|
hpts->p_on_min_sleep = 1;
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Here we have low count on the wheel, but
|
|
|
|
* somehow we still collided with one of the
|
|
|
|
* connections. Lets go back to sleep for a
|
|
|
|
* min sleep time, but clear the flag so we
|
|
|
|
* can be awoken by insert.
|
|
|
|
*/
|
|
|
|
hpts->p_on_min_sleep = 0;
|
|
|
|
tv.tv_usec = tcp_min_hptsi_time;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Directly woken most likely to reset the
|
|
|
|
* callout time.
|
|
|
|
*/
|
|
|
|
tv.tv_sec = 0;
|
|
|
|
tv.tv_usec = hpts->p_mysleep.tv_usec;
|
|
|
|
}
|
|
|
|
goto back_to_sleep;
|
|
|
|
}
|
|
|
|
hpts->sleeping = 0;
|
|
|
|
hpts->p_hpts_active = 1;
|
|
|
|
ticks_ran = tcp_hptsi(hpts, 1);
|
2018-04-19 13:37:59 +00:00
|
|
|
tv.tv_sec = 0;
|
2021-07-06 19:23:22 +00:00
|
|
|
tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
|
|
|
|
if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
|
|
|
|
if(hpts->p_direct_wake == 0) {
|
|
|
|
/*
|
|
|
|
* Only adjust sleep time if we were
|
|
|
|
* called from the callout i.e. direct_wake == 0.
|
|
|
|
*/
|
|
|
|
if (ticks_ran < ticks_indicate_more_sleep) {
|
|
|
|
hpts->p_mysleep.tv_usec *= 2;
|
|
|
|
if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
|
|
|
|
hpts->p_mysleep.tv_usec = dynamic_max_sleep;
|
|
|
|
} else if (ticks_ran > ticks_indicate_less_sleep) {
|
|
|
|
hpts->p_mysleep.tv_usec /= 2;
|
|
|
|
if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
|
|
|
|
hpts->p_mysleep.tv_usec = dynamic_min_sleep;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (tv.tv_usec < hpts->p_mysleep.tv_usec) {
|
|
|
|
hpts->overidden_sleep = tv.tv_usec;
|
|
|
|
tv.tv_usec = hpts->p_mysleep.tv_usec;
|
|
|
|
} else if (tv.tv_usec > dynamic_max_sleep) {
|
|
|
|
/* Lets not let sleep get above this value */
|
|
|
|
hpts->overidden_sleep = tv.tv_usec;
|
|
|
|
tv.tv_usec = dynamic_max_sleep;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* In this mode the timer is a backstop to
|
|
|
|
* all the userret/lro_flushes so we use
|
|
|
|
* the dynamic value and set the on_min_sleep
|
|
|
|
* flag so we will not be awoken.
|
|
|
|
*/
|
2018-04-19 13:37:59 +00:00
|
|
|
hpts->p_on_min_sleep = 1;
|
2021-07-06 19:23:22 +00:00
|
|
|
} else if (hpts->p_on_queue_cnt == 0) {
|
|
|
|
/*
|
|
|
|
* No one on the wheel, please wake us up
|
|
|
|
* if you insert on the wheel.
|
|
|
|
*/
|
2018-04-19 13:37:59 +00:00
|
|
|
hpts->p_on_min_sleep = 0;
|
2021-07-06 19:23:22 +00:00
|
|
|
hpts->overidden_sleep = 0;
|
2018-04-19 13:37:59 +00:00
|
|
|
} else {
|
2021-07-06 19:23:22 +00:00
|
|
|
/*
|
|
|
|
* We hit here when we have a low number of
|
|
|
|
* clients on the wheel (our else clause).
|
|
|
|
* We may need to go on min sleep, if we set
|
|
|
|
* the flag we will not be awoken if someone
|
|
|
|
* is inserted ahead of us. Clearing the flag
|
|
|
|
* means we can be awoken. This is "old mode"
|
|
|
|
* where the timer is what runs hpts mainly.
|
|
|
|
*/
|
|
|
|
if (tv.tv_usec < tcp_min_hptsi_time) {
|
|
|
|
/*
|
|
|
|
* Yes on min sleep, which means
|
|
|
|
* we cannot be awoken.
|
|
|
|
*/
|
|
|
|
hpts->overidden_sleep = tv.tv_usec;
|
|
|
|
tv.tv_usec = tcp_min_hptsi_time;
|
|
|
|
hpts->p_on_min_sleep = 1;
|
|
|
|
} else {
|
|
|
|
/* Clear the min sleep flag */
|
|
|
|
hpts->overidden_sleep = 0;
|
|
|
|
hpts->p_on_min_sleep = 0;
|
|
|
|
}
|
2018-04-19 13:37:59 +00:00
|
|
|
}
|
2021-07-06 19:23:22 +00:00
|
|
|
HPTS_MTX_ASSERT(hpts);
|
|
|
|
hpts->p_hpts_active = 0;
|
|
|
|
back_to_sleep:
|
2018-04-19 13:37:59 +00:00
|
|
|
hpts->p_direct_wake = 0;
|
2021-07-06 19:23:22 +00:00
|
|
|
sb = tvtosbt(tv);
|
|
|
|
cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
|
|
|
|
/* Store off to make visible the actual sleep time */
|
|
|
|
hpts->sleeping = tv.tv_usec;
|
|
|
|
callout_reset_sbt_on(&hpts->co, sb, 0,
|
|
|
|
hpts_timeout_swi, hpts, cpu,
|
|
|
|
(C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
|
|
|
|
NET_EPOCH_EXIT(et);
|
2018-04-19 13:37:59 +00:00
|
|
|
mtx_unlock(&hpts->p_mtx);
|
|
|
|
}
|
|
|
|
|
|
|
|
#undef timersub
|
|
|
|
|
|
|
|
static void
|
|
|
|
tcp_init_hptsi(void *st)
|
|
|
|
{
|
|
|
|
int32_t i, j, error, bound = 0, created = 0;
|
|
|
|
size_t sz, asz;
|
|
|
|
struct timeval tv;
|
|
|
|
sbintime_t sb;
|
|
|
|
struct tcp_hpts_entry *hpts;
|
2019-05-10 13:41:19 +00:00
|
|
|
struct pcpu *pc;
|
|
|
|
cpuset_t cs;
|
2018-04-19 13:37:59 +00:00
|
|
|
char unit[16];
|
|
|
|
uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
|
2021-07-06 19:23:22 +00:00
|
|
|
int count, domain, cpu;
|
2018-04-19 13:37:59 +00:00
|
|
|
|
|
|
|
tcp_pace.rp_num_hptss = ncpus;
|
2019-07-10 20:40:39 +00:00
|
|
|
hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK);
|
2018-04-19 13:37:59 +00:00
|
|
|
hpts_loops = counter_u64_alloc(M_WAITOK);
|
|
|
|
back_tosleep = counter_u64_alloc(M_WAITOK);
|
2019-07-10 20:40:39 +00:00
|
|
|
combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
|
|
|
|
wheel_wrap = counter_u64_alloc(M_WAITOK);
|
2021-07-06 19:23:22 +00:00
|
|
|
hpts_wake_timeout = counter_u64_alloc(M_WAITOK);
|
|
|
|
hpts_direct_awakening = counter_u64_alloc(M_WAITOK);
|
|
|
|
hpts_back_tosleep = counter_u64_alloc(M_WAITOK);
|
|
|
|
hpts_direct_call = counter_u64_alloc(M_WAITOK);
|
|
|
|
cpu_uses_flowid = counter_u64_alloc(M_WAITOK);
|
|
|
|
cpu_uses_random = counter_u64_alloc(M_WAITOK);
|
|
|
|
|
|
|
|
|
2018-04-19 13:37:59 +00:00
|
|
|
sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
|
|
|
|
tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
|
2021-07-06 19:23:22 +00:00
|
|
|
sz = (sizeof(uint32_t) * tcp_pace.rp_num_hptss);
|
|
|
|
cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK);
|
2018-04-19 13:37:59 +00:00
|
|
|
asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
|
|
|
|
for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
|
|
|
|
tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
|
|
|
|
M_TCPHPTS, M_WAITOK | M_ZERO);
|
|
|
|
tcp_pace.rp_ent[i]->p_hptss = malloc(asz,
|
|
|
|
M_TCPHPTS, M_WAITOK);
|
|
|
|
hpts = tcp_pace.rp_ent[i];
|
|
|
|
/*
|
|
|
|
* Init all the hpts structures that are not specifically
|
|
|
|
* zero'd by the allocations. Also lets attach them to the
|
|
|
|
* appropriate sysctl block as well.
|
|
|
|
*/
|
|
|
|
mtx_init(&hpts->p_mtx, "tcp_hpts_lck",
|
|
|
|
"hpts", MTX_DEF | MTX_DUPOK);
|
2021-12-02 22:45:04 +00:00
|
|
|
TAILQ_INIT(&hpts->p_dropq);
|
2018-04-19 13:37:59 +00:00
|
|
|
for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
|
2021-12-02 22:45:04 +00:00
|
|
|
TAILQ_INIT(&hpts->p_hptss[j].head);
|
|
|
|
hpts->p_hptss[j].count = 0;
|
|
|
|
hpts->p_hptss[j].gencnt = 0;
|
2018-04-19 13:37:59 +00:00
|
|
|
}
|
|
|
|
sysctl_ctx_init(&hpts->hpts_ctx);
|
|
|
|
sprintf(unit, "%d", i);
|
|
|
|
hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
|
|
|
|
SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
|
|
|
|
OID_AUTO,
|
|
|
|
unit,
|
2020-02-26 14:26:36 +00:00
|
|
|
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
|
2018-04-19 13:37:59 +00:00
|
|
|
"");
|
|
|
|
SYSCTL_ADD_INT(&hpts->hpts_ctx,
|
|
|
|
SYSCTL_CHILDREN(hpts->hpts_root),
|
|
|
|
OID_AUTO, "in_qcnt", CTLFLAG_RD,
|
2021-12-02 22:45:04 +00:00
|
|
|
&hpts->p_dropq_cnt, 0,
|
|
|
|
"Count TCB's awaiting delayed drop");
|
2018-04-19 13:37:59 +00:00
|
|
|
SYSCTL_ADD_INT(&hpts->hpts_ctx,
|
|
|
|
SYSCTL_CHILDREN(hpts->hpts_root),
|
|
|
|
OID_AUTO, "out_qcnt", CTLFLAG_RD,
|
|
|
|
&hpts->p_on_queue_cnt, 0,
|
|
|
|
"Count TCB's awaiting output processing");
|
2019-07-10 20:40:39 +00:00
|
|
|
SYSCTL_ADD_U16(&hpts->hpts_ctx,
|
2018-04-19 13:37:59 +00:00
|
|
|
SYSCTL_CHILDREN(hpts->hpts_root),
|
|
|
|
OID_AUTO, "active", CTLFLAG_RD,
|
|
|
|
&hpts->p_hpts_active, 0,
|
|
|
|
"Is the hpts active");
|
|
|
|
SYSCTL_ADD_UINT(&hpts->hpts_ctx,
|
|
|
|
SYSCTL_CHILDREN(hpts->hpts_root),
|
|
|
|
OID_AUTO, "curslot", CTLFLAG_RD,
|
|
|
|
&hpts->p_cur_slot, 0,
|
2019-07-10 20:40:39 +00:00
|
|
|
"What the current running pacers goal");
|
2018-04-19 13:37:59 +00:00
|
|
|
SYSCTL_ADD_UINT(&hpts->hpts_ctx,
|
|
|
|
SYSCTL_CHILDREN(hpts->hpts_root),
|
2019-07-10 20:40:39 +00:00
|
|
|
OID_AUTO, "runtick", CTLFLAG_RD,
|
2021-07-06 19:23:22 +00:00
|
|
|
&hpts->p_runningslot, 0,
|
2019-07-10 20:40:39 +00:00
|
|
|
"What the running pacers current slot is");
|
2018-04-19 13:37:59 +00:00
|
|
|
SYSCTL_ADD_UINT(&hpts->hpts_ctx,
|
|
|
|
SYSCTL_CHILDREN(hpts->hpts_root),
|
2019-07-10 20:40:39 +00:00
|
|
|
OID_AUTO, "curtick", CTLFLAG_RD,
|
|
|
|
&hpts->p_curtick, 0,
|
|
|
|
"What the running pacers last tick mapped to the wheel was");
|
2021-07-06 19:23:22 +00:00
|
|
|
SYSCTL_ADD_UINT(&hpts->hpts_ctx,
|
|
|
|
SYSCTL_CHILDREN(hpts->hpts_root),
|
|
|
|
OID_AUTO, "lastran", CTLFLAG_RD,
|
|
|
|
&cts_last_ran[i], 0,
|
|
|
|
"The last usec tick that this hpts ran");
|
2021-07-08 11:06:58 +00:00
|
|
|
SYSCTL_ADD_LONG(&hpts->hpts_ctx,
|
2021-07-06 19:23:22 +00:00
|
|
|
SYSCTL_CHILDREN(hpts->hpts_root),
|
|
|
|
OID_AUTO, "cur_min_sleep", CTLFLAG_RD,
|
2021-07-08 11:06:58 +00:00
|
|
|
&hpts->p_mysleep.tv_usec,
|
2021-07-06 19:23:22 +00:00
|
|
|
"What the running pacers is using for p_mysleep.tv_usec");
|
|
|
|
SYSCTL_ADD_U64(&hpts->hpts_ctx,
|
|
|
|
SYSCTL_CHILDREN(hpts->hpts_root),
|
|
|
|
OID_AUTO, "now_sleeping", CTLFLAG_RD,
|
|
|
|
&hpts->sleeping, 0,
|
|
|
|
"What the running pacers is actually sleeping for");
|
|
|
|
SYSCTL_ADD_U64(&hpts->hpts_ctx,
|
|
|
|
SYSCTL_CHILDREN(hpts->hpts_root),
|
|
|
|
OID_AUTO, "syscall_cnt", CTLFLAG_RD,
|
|
|
|
&hpts->syscall_cnt, 0,
|
|
|
|
"How many times we had syscalls on this hpts");
|
|
|
|
|
2019-07-10 20:40:39 +00:00
|
|
|
hpts->p_hpts_sleep_time = hpts_sleep_max;
|
2018-04-19 13:37:59 +00:00
|
|
|
hpts->p_num = i;
|
2019-07-10 20:40:39 +00:00
|
|
|
hpts->p_curtick = tcp_gethptstick(&tv);
|
2021-07-06 19:23:22 +00:00
|
|
|
cts_last_ran[i] = tcp_tv_to_usectick(&tv);
|
2019-07-10 20:40:39 +00:00
|
|
|
hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
|
2018-04-19 13:37:59 +00:00
|
|
|
hpts->p_cpu = 0xffff;
|
2021-07-06 19:23:22 +00:00
|
|
|
hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1);
|
2018-04-19 13:37:59 +00:00
|
|
|
callout_init(&hpts->co, 1);
|
|
|
|
}
|
2019-05-10 13:41:19 +00:00
|
|
|
|
|
|
|
/* Don't try to bind to NUMA domains if we don't have any */
|
|
|
|
if (vm_ndomains == 1 && tcp_bind_threads == 2)
|
|
|
|
tcp_bind_threads = 0;
|
|
|
|
|
2018-04-19 13:37:59 +00:00
|
|
|
/*
|
|
|
|
* Now lets start ithreads to handle the hptss.
|
|
|
|
*/
|
2021-07-06 19:23:22 +00:00
|
|
|
for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
|
2018-04-19 13:37:59 +00:00
|
|
|
hpts = tcp_pace.rp_ent[i];
|
|
|
|
hpts->p_cpu = i;
|
|
|
|
error = swi_add(&hpts->ie, "hpts",
|
|
|
|
tcp_hpts_thread, (void *)hpts,
|
|
|
|
SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);
|
2021-07-06 19:23:22 +00:00
|
|
|
KASSERT(error == 0,
|
|
|
|
("Can't add hpts:%p i:%d err:%d",
|
|
|
|
hpts, i, error));
|
2018-04-19 13:37:59 +00:00
|
|
|
created++;
|
2021-07-06 19:23:22 +00:00
|
|
|
hpts->p_mysleep.tv_sec = 0;
|
|
|
|
hpts->p_mysleep.tv_usec = tcp_min_hptsi_time;
|
2019-05-10 13:41:19 +00:00
|
|
|
if (tcp_bind_threads == 1) {
|
2018-04-19 13:37:59 +00:00
|
|
|
if (intr_event_bind(hpts->ie, i) == 0)
|
|
|
|
bound++;
|
2019-05-10 13:41:19 +00:00
|
|
|
} else if (tcp_bind_threads == 2) {
|
|
|
|
pc = pcpu_find(i);
|
|
|
|
domain = pc->pc_domain;
|
|
|
|
CPU_COPY(&cpuset_domain[domain], &cs);
|
|
|
|
if (intr_event_bind_ithread_cpuset(hpts->ie, &cs)
|
|
|
|
== 0) {
|
|
|
|
bound++;
|
|
|
|
count = hpts_domains[domain].count;
|
|
|
|
hpts_domains[domain].cpu[count] = i;
|
|
|
|
hpts_domains[domain].count++;
|
|
|
|
}
|
2018-04-19 13:37:59 +00:00
|
|
|
}
|
|
|
|
tv.tv_sec = 0;
|
2021-07-06 19:23:22 +00:00
|
|
|
tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
|
|
|
|
hpts->sleeping = tv.tv_usec;
|
2018-04-19 13:37:59 +00:00
|
|
|
sb = tvtosbt(tv);
|
2021-07-06 19:23:22 +00:00
|
|
|
cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
|
|
|
|
callout_reset_sbt_on(&hpts->co, sb, 0,
|
|
|
|
hpts_timeout_swi, hpts, cpu,
|
|
|
|
(C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
|
2018-04-19 13:37:59 +00:00
|
|
|
}
|
2019-05-10 13:41:19 +00:00
|
|
|
/*
|
|
|
|
* If we somehow have an empty domain, fall back to choosing
|
|
|
|
* among all htps threads.
|
|
|
|
*/
|
|
|
|
for (i = 0; i < vm_ndomains; i++) {
|
|
|
|
if (hpts_domains[i].count == 0) {
|
|
|
|
tcp_bind_threads = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
printf("TCP Hpts created %d swi interrupt threads and bound %d to %s\n",
|
|
|
|
created, bound,
|
|
|
|
tcp_bind_threads == 2 ? "NUMA domains" : "cpus");
|
2021-07-06 19:23:22 +00:00
|
|
|
#ifdef INVARIANTS
|
|
|
|
printf("HPTS is in INVARIANT mode!!\n");
|
|
|
|
#endif
|
2018-04-19 13:37:59 +00:00
|
|
|
}
|
|
|
|
|
2021-07-06 19:23:22 +00:00
|
|
|
/*
 * Register tcp_init_hptsi() to run at boot (softintr stage), and export
 * this file as the "tcphpts" kernel module, version 1.
 */
SYSINIT(tcphptsi, SI_SUB_SOFTINTR, SI_ORDER_ANY, tcp_init_hptsi, NULL);
MODULE_VERSION(tcphpts, 1);
|