1
0
mirror of https://git.FreeBSD.org/src.git synced 2024-12-16 10:20:30 +00:00

The FreeBSD-SA-14:08.tcp was a lesson on not doing acrobatics with

mixing on stack memory and UMA memory in one linked list.

Thus, rewrite TCP reassembly code in terms of memory usage. The
algorithm remains unchanged.

We actually do not need extra memory to build a reassembly queue.
Arriving mbufs are always packet header mbufs. So we got the length
of data as pkthdr.len. We got m_nextpkt for linkage. And we need
only one pointer to point at the tcphdr, use PH_loc for that.

In tcpcb the t_segq fields becomes mbuf pointer. The t_segqlen
field now counts not packets, but bytes in the queue. This gives
us more precision when comparing to socket buffer limits.

Sponsored by:	Netflix
Sponsored by:	Nginx, Inc.
This commit is contained in:
Gleb Smirnoff 2014-05-04 23:25:32 +00:00
parent 4f31a94bd2
commit e407b67be4
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=265338
6 changed files with 77 additions and 173 deletions

View File

@ -1639,8 +1639,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->snd_nxt == tp->snd_max &&
tiwin && tiwin == tp->snd_wnd &&
((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
LIST_EMPTY(&tp->t_segq) &&
((to.to_flags & TOF_TS) == 0 ||
tp->t_segq == NULL && ((to.to_flags & TOF_TS) == 0 ||
TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
/*
@ -2908,8 +2907,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* immediately when segments are out of order (so
* fast retransmit can work).
*/
if (th->th_seq == tp->rcv_nxt &&
LIST_EMPTY(&tp->t_segq) &&
if (th->th_seq == tp->rcv_nxt && tp->t_segq == NULL &&
TCPS_HAVEESTABLISHED(tp->t_state)) {
if (DELAY_ACK(tp, tlen))
tp->t_flags |= TF_DELACK;

View File

@ -71,19 +71,10 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_var.h>
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif /* TCPDEBUG */
static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
"TCP Segment Reassembly Queue");
static VNET_DEFINE(int, tcp_reass_maxseg) = 0;
#define V_tcp_reass_maxseg VNET(tcp_reass_maxseg)
SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN,
&VNET_NAME(tcp_reass_maxseg), 0,
"Global maximum number of TCP Segments in Reassembly Queue");
static VNET_DEFINE(int, tcp_reass_overflows) = 0;
#define V_tcp_reass_overflows VNET(tcp_reass_overflows)
SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, overflows,
@ -91,78 +82,32 @@ SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, overflows,
&VNET_NAME(tcp_reass_overflows), 0,
"Global number of TCP Segment Reassembly Queue Overflows");
static VNET_DEFINE(uma_zone_t, tcp_reass_zone);
#define V_tcp_reass_zone VNET(tcp_reass_zone)
SYSCTL_UMA_CUR(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_VNET,
&VNET_NAME(tcp_reass_zone),
"Global number of TCP Segments currently in Reassembly Queue");
/* Initialize TCP reassembly queue */
static void
tcp_reass_zone_change(void *tag)
{
/* Set the zone limit and read back the effective value. */
V_tcp_reass_maxseg = nmbclusters / 16;
V_tcp_reass_maxseg = uma_zone_set_max(V_tcp_reass_zone,
V_tcp_reass_maxseg);
}
void
tcp_reass_init(void)
{
V_tcp_reass_maxseg = nmbclusters / 16;
TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments",
&V_tcp_reass_maxseg);
V_tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
/* Set the zone limit and read back the effective value. */
V_tcp_reass_maxseg = uma_zone_set_max(V_tcp_reass_zone,
V_tcp_reass_maxseg);
EVENTHANDLER_REGISTER(nmbclusters_change,
tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
}
#ifdef VIMAGE
void
tcp_reass_destroy(void)
{
uma_zdestroy(V_tcp_reass_zone);
}
#endif
void
tcp_reass_flush(struct tcpcb *tp)
{
struct tseg_qent *qe;
struct mbuf *m;
INP_WLOCK_ASSERT(tp->t_inpcb);
while ((qe = LIST_FIRST(&tp->t_segq)) != NULL) {
LIST_REMOVE(qe, tqe_q);
m_freem(qe->tqe_m);
uma_zfree(V_tcp_reass_zone, qe);
tp->t_segqlen--;
while ((m = tp->t_segq) != NULL) {
tp->t_segq = m->m_nextpkt;
tp->t_segqlen -= m->m_pkthdr.len;
m_freem(m);
}
KASSERT((tp->t_segqlen == 0),
("TCP reass queue %p segment count is %d instead of 0 after flush.",
("TCP reass queue %p length is %d instead of 0 after flush.",
tp, tp->t_segqlen));
}
#define M_TCPHDR(m) ((struct tcphdr *)((m)->m_pkthdr.pkt_tcphdr))
int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
{
struct tseg_qent *q;
struct tseg_qent *p = NULL;
struct tseg_qent *nq;
struct tseg_qent *te = NULL;
struct socket *so = tp->t_inpcb->inp_socket;
char *s = NULL;
int flags;
struct tseg_qent tqs;
struct mbuf *mq, *mp;
int flags, wakeup;
INP_WLOCK_ASSERT(tp->t_inpcb);
@ -178,6 +123,10 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
if (th == NULL)
goto present;
M_ASSERTPKTHDR(m);
KASSERT(*tlenp == m->m_pkthdr.len, ("%s: tlenp %u len %u", __func__,
*tlenp, m->m_pkthdr.len));
/*
* Limit the number of segments that can be queued to reduce the
* potential for mbuf exhaustion. For best performance, we want to be
@ -188,19 +137,17 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
* Always let the missing segment through which caused this queue.
* NB: Access to the socket buffer is left intentionally unlocked as we
* can tolerate stale information here.
*
* XXXLAS: Using sbspace(so->so_rcv) instead of so->so_rcv.sb_hiwat
* should work but causes packets to be dropped when they shouldn't.
* Investigate why and re-evaluate the below limit after the behaviour
* is understood.
*/
if ((th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) &&
tp->t_segqlen >= (so->so_rcv.sb_hiwat / tp->t_maxseg) + 1) {
tp->t_segqlen + m->m_pkthdr.len >= sbspace(&so->so_rcv)) {
char *s;
V_tcp_reass_overflows++;
TCPSTAT_INC(tcps_rcvmemdrop);
m_freem(m);
*tlenp = 0;
if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) {
if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
NULL))) {
log(LOG_DEBUG, "%s; %s: queue limit reached, "
"segment dropped\n", s, __func__);
free(s, M_TCPLOG);
@ -208,47 +155,14 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
return (0);
}
/*
* Allocate a new queue entry. If we can't, or hit the zone limit
* just drop the pkt.
*
* Use a temporary structure on the stack for the missing segment
* when the zone is exhausted. Otherwise we may get stuck.
*/
te = uma_zalloc(V_tcp_reass_zone, M_NOWAIT);
if (te == NULL) {
if (th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) {
TCPSTAT_INC(tcps_rcvmemdrop);
m_freem(m);
*tlenp = 0;
if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
NULL))) {
log(LOG_DEBUG, "%s; %s: global zone limit "
"reached, segment dropped\n", s, __func__);
free(s, M_TCPLOG);
}
return (0);
} else {
bzero(&tqs, sizeof(struct tseg_qent));
te = &tqs;
if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
NULL))) {
log(LOG_DEBUG,
"%s; %s: global zone limit reached, using "
"stack for missing segment\n", s, __func__);
free(s, M_TCPLOG);
}
}
}
tp->t_segqlen++;
/*
* Find a segment which begins after this one does.
*/
LIST_FOREACH(q, &tp->t_segq, tqe_q) {
if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
mp = NULL;
for (mq = tp->t_segq; mq != NULL; mq = mq->m_nextpkt) {
if (SEQ_GT(M_TCPHDR(mq)->th_seq, th->th_seq))
break;
p = q;
mp = mq;
}
/*
@ -256,18 +170,16 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
* our data already. If so, drop the data from the incoming
* segment. If it provides all of our data, drop us.
*/
if (p != NULL) {
if (mp != NULL) {
int i;
/* conversion to int (in i) handles seq wraparound */
i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
i = M_TCPHDR(mp)->th_seq + mp->m_pkthdr.len - th->th_seq;
if (i > 0) {
if (i >= *tlenp) {
TCPSTAT_INC(tcps_rcvduppack);
TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp);
m_freem(m);
if (te != &tqs)
uma_zfree(V_tcp_reass_zone, te);
tp->t_segqlen--;
/*
* Try to present any queued data
* at the left window edge to the user.
@ -289,37 +201,40 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
* While we overlap succeeding segments trim them or,
* if they are completely covered, dequeue them.
*/
while (q) {
int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
while (mq) {
struct mbuf *nq;
int i;
i = (th->th_seq + *tlenp) - M_TCPHDR(mq)->th_seq;
if (i <= 0)
break;
if (i < q->tqe_len) {
q->tqe_th->th_seq += i;
q->tqe_len -= i;
m_adj(q->tqe_m, i);
if (i < mq->m_pkthdr.len) {
M_TCPHDR(mq)->th_seq += i;
m_adj(mq, i);
tp->t_segqlen -= i;
break;
}
nq = LIST_NEXT(q, tqe_q);
LIST_REMOVE(q, tqe_q);
m_freem(q->tqe_m);
uma_zfree(V_tcp_reass_zone, q);
tp->t_segqlen--;
q = nq;
nq = mq->m_nextpkt;
tp->t_segqlen -= mq->m_pkthdr.len;
m_freem(mq);
if (mp)
mp->m_nextpkt = nq;
else
tp->t_segq = nq;
mq = nq;
}
/* Insert the new segment queue entry into place. */
te->tqe_m = m;
te->tqe_th = th;
te->tqe_len = *tlenp;
if (p == NULL) {
LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
if (mp) {
m->m_nextpkt = mp->m_nextpkt;
mp->m_nextpkt = m;
} else {
KASSERT(te != &tqs, ("%s: temporary stack based entry not "
"first element in queue", __func__));
LIST_INSERT_AFTER(p, te, tqe_q);
m->m_nextpkt = tp->t_segq;
tp->t_segq = m ;
}
m->m_pkthdr.pkt_tcphdr = th;
tp->t_segqlen += m->m_pkthdr.len;
present:
/*
@ -328,25 +243,30 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
*/
if (!TCPS_HAVEESTABLISHED(tp->t_state))
return (0);
q = LIST_FIRST(&tp->t_segq);
if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
return (0);
flags = 0;
wakeup = 0;
SOCKBUF_LOCK(&so->so_rcv);
do {
tp->rcv_nxt += q->tqe_len;
flags = q->tqe_th->th_flags & TH_FIN;
nq = LIST_NEXT(q, tqe_q);
LIST_REMOVE(q, tqe_q);
while ((mq = tp->t_segq) != NULL &&
M_TCPHDR(mq)->th_seq == tp->rcv_nxt) {
tp->t_segq = mq->m_nextpkt;
tp->rcv_nxt += mq->m_pkthdr.len;
tp->t_segqlen -= mq->m_pkthdr.len;
flags = M_TCPHDR(mq)->th_flags & TH_FIN;
if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
m_freem(q->tqe_m);
else
sbappendstream_locked(&so->so_rcv, q->tqe_m);
if (q != &tqs)
uma_zfree(V_tcp_reass_zone, q);
tp->t_segqlen--;
q = nq;
} while (q && q->tqe_th->th_seq == tp->rcv_nxt);
m_freem(mq);
else {
mq->m_nextpkt = NULL;
sbappendstream_locked(&so->so_rcv, mq);
wakeup = 1;
}
}
ND6_HINT(tp);
sorwakeup_locked(so);
if (wakeup)
sorwakeup_locked(so);
else
SOCKBUF_UNLOCK(&so->so_rcv);
return (flags);
}

View File

@ -375,7 +375,6 @@ tcp_init(void)
tcp_tw_init();
syncache_init();
tcp_hc_init();
tcp_reass_init();
TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack);
V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
@ -433,7 +432,6 @@ tcp_destroy(void)
{
int error;
tcp_reass_destroy();
tcp_hc_destroy();
syncache_destroy();
tcp_tw_destroy();

View File

@ -1949,7 +1949,7 @@ db_print_tcpcb(struct tcpcb *tp, const char *name, int indent)
db_print_indent(indent);
db_printf("t_segq first: %p t_segqlen: %d t_dupacks: %d\n",
LIST_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks);
tp->t_segq, tp->t_segqlen, tp->t_dupacks);
db_print_indent(indent);
db_printf("tt_rexmt: %p tt_persist: %p tt_keep: %p\n",

View File

@ -46,15 +46,6 @@ VNET_DECLARE(int, tcp_do_rfc1323);
#endif /* _KERNEL */
/* TCP segment queue entry */
struct tseg_qent {
LIST_ENTRY(tseg_qent) tqe_q;
int tqe_len; /* TCP segment data length */
struct tcphdr *tqe_th; /* a pointer to tcp header */
struct mbuf *tqe_m; /* mbuf contains packet */
};
LIST_HEAD(tsegqe_head, tseg_qent);
struct sackblk {
tcp_seq start; /* start seq no. of sack block */
tcp_seq end; /* end seq no. */
@ -100,7 +91,7 @@ do { \
* Organized for 16 byte cacheline efficiency.
*/
struct tcpcb {
struct tsegqe_head t_segq; /* segment reassembly queue */
struct mbuf *t_segq; /* segment reassembly queue */
void *t_pspare[2]; /* new reassembly queue */
int t_segqlen; /* segment reassembly queue length */
int t_dupacks; /* consecutive dup acks recd */
@ -663,11 +654,7 @@ char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *,
char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *,
const void *);
int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *);
void tcp_reass_init(void);
void tcp_reass_flush(struct tcpcb *);
#ifdef VIMAGE
void tcp_reass_destroy(void);
#endif
void tcp_input(struct mbuf *, int);
u_long tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);
u_long tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *);

View File

@ -159,6 +159,7 @@ struct pkthdr {
#define tso_segsz PH_per.sixteen[1]
#define csum_phsum PH_per.sixteen[2]
#define csum_data PH_per.thirtytwo[1]
#define pkt_tcphdr PH_loc.ptr
/*
* Description of external storage mapped into mbuf; valid only if M_EXT is