mirror of
https://git.FreeBSD.org/src.git
synced 2025-01-16 15:11:52 +00:00
Refactor the NFS over UDP retransmit timeout estimation logic to allow
the estimator to be more easily tuned and maintained. There should be no functional change except there is now a lower limit on the retransmit timeout to prevent the client from retransmitting faster than the server's disks can fill requests, and an upper limit to prevent the estimator from taking too long to retransmit during a server outage. Reviewed by: mohan, kris, silby Sponsored by: Network Appliance, Incorporated
This commit is contained in:
parent
6a09faf2cb
commit
94163ea283
Notes:
svn2git
2020-12-20 02:59:44 +00:00
svn path=/head/; revision=158859
@ -257,6 +257,31 @@ extern int nfs_debug;
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
* On fast networks, the estimator will try to reduce the
|
||||||
|
* timeout lower than the latency of the server's disks,
|
||||||
|
* which results in too many timeouts, so cap the lower
|
||||||
|
* bound.
|
||||||
|
*/
|
||||||
|
#define NFS_MINRTO (NFS_HZ >> 2)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Keep the RTO from increasing to unreasonably large values
|
||||||
|
* when a server is not responding.
|
||||||
|
*/
|
||||||
|
#define NFS_MAXRTO (20 * NFS_HZ)
|
||||||
|
|
||||||
|
/*
 * RTT timer classes.  Every NFS procedure is assigned to one of these.
 * The numeric values matter: the estimator code uses (class - 1) as the
 * index into the per-mount nm_srtt[]/nm_sdrtt[] arrays, so
 * NFS_DEFAULT_TIMER (0) has no slot of its own.
 */
enum nfs_rto_timer_t {
	NFS_DEFAULT_TIMER = 0,	/* no estimator state; uses nm_timeo */
	NFS_GETATTR_TIMER = 1,
	NFS_LOOKUP_TIMER = 2,
	NFS_READ_TIMER = 3,
	NFS_WRITE_TIMER = 4,
};
|
||||||
|
#define NFS_MAX_TIMER (NFS_WRITE_TIMER)
|
||||||
|
|
||||||
|
#define NFS_INITRTT (NFS_HZ << 3)
|
||||||
|
|
||||||
vfs_init_t nfs_init;
|
vfs_init_t nfs_init;
|
||||||
vfs_uninit_t nfs_uninit;
|
vfs_uninit_t nfs_uninit;
|
||||||
int nfs_mountroot(struct mount *mp, struct thread *td);
|
int nfs_mountroot(struct mount *mp, struct thread *td);
|
||||||
|
@ -79,38 +79,6 @@ __FBSDID("$FreeBSD$");
|
|||||||
|
|
||||||
extern u_int32_t nfs_xid;
|
extern u_int32_t nfs_xid;
|
||||||
|
|
||||||
/*
|
|
||||||
* Estimate rto for an nfs rpc sent via an unreliable datagram.
|
|
||||||
* Use the mean and mean deviation of rtt for the appropriate type of rpc
|
|
||||||
* for the frequent rpcs and a default for the others.
|
|
||||||
* The justification for doing "other" this way is that these rpcs
|
|
||||||
* happen so infrequently that timer est. would probably be stale.
|
|
||||||
* Also, since many of these rpcs are
|
|
||||||
* non-idempotent, a conservative timeout is desired.
|
|
||||||
* getattr, lookup - A+2D
|
|
||||||
* read, write - A+4D
|
|
||||||
* other - nm_timeo
|
|
||||||
*/
|
|
||||||
#define NFS_RTO(n, t) \
|
|
||||||
((t) == 0 ? (n)->nm_timeo : \
|
|
||||||
((t) < 3 ? \
|
|
||||||
(((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
|
|
||||||
((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
|
|
||||||
#define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
|
|
||||||
#define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Defines which timer to use for the procnum.
|
|
||||||
* 0 - default
|
|
||||||
* 1 - getattr
|
|
||||||
* 2 - lookup
|
|
||||||
* 3 - read
|
|
||||||
* 4 - write
|
|
||||||
*/
|
|
||||||
static int proct[NFS_NPROCS] = {
|
|
||||||
0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0,
|
|
||||||
};
|
|
||||||
|
|
||||||
static int nfs_realign_test;
|
static int nfs_realign_test;
|
||||||
static int nfs_realign_count;
|
static int nfs_realign_count;
|
||||||
static int nfs_bufpackets = 4;
|
static int nfs_bufpackets = 4;
|
||||||
@ -156,6 +124,132 @@ static void nfs_clnt_udp_soupcall(struct socket *so, void *arg, int waitflag);
|
|||||||
|
|
||||||
extern struct mtx nfs_reqq_mtx;
|
extern struct mtx nfs_reqq_mtx;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* RTT estimator
|
||||||
|
*/
|
||||||
|
|
||||||
|
static enum nfs_rto_timer_t nfs_proct[NFS_NPROCS] = {
|
||||||
|
NFS_DEFAULT_TIMER, /* NULL */
|
||||||
|
NFS_GETATTR_TIMER, /* GETATTR */
|
||||||
|
NFS_DEFAULT_TIMER, /* SETATTR */
|
||||||
|
NFS_LOOKUP_TIMER, /* LOOKUP */
|
||||||
|
NFS_GETATTR_TIMER, /* ACCESS */
|
||||||
|
NFS_READ_TIMER, /* READLINK */
|
||||||
|
NFS_READ_TIMER, /* READ */
|
||||||
|
NFS_WRITE_TIMER, /* WRITE */
|
||||||
|
NFS_DEFAULT_TIMER, /* CREATE */
|
||||||
|
NFS_DEFAULT_TIMER, /* MKDIR */
|
||||||
|
NFS_DEFAULT_TIMER, /* SYMLINK */
|
||||||
|
NFS_DEFAULT_TIMER, /* MKNOD */
|
||||||
|
NFS_DEFAULT_TIMER, /* REMOVE */
|
||||||
|
NFS_DEFAULT_TIMER, /* RMDIR */
|
||||||
|
NFS_DEFAULT_TIMER, /* RENAME */
|
||||||
|
NFS_DEFAULT_TIMER, /* LINK */
|
||||||
|
NFS_READ_TIMER, /* READDIR */
|
||||||
|
NFS_READ_TIMER, /* READDIRPLUS */
|
||||||
|
NFS_DEFAULT_TIMER, /* FSSTAT */
|
||||||
|
NFS_DEFAULT_TIMER, /* FSINFO */
|
||||||
|
NFS_DEFAULT_TIMER, /* PATHCONF */
|
||||||
|
NFS_DEFAULT_TIMER, /* COMMIT */
|
||||||
|
NFS_DEFAULT_TIMER, /* NOOP */
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Choose the correct RTT timer for this NFS procedure.
|
||||||
|
*/
|
||||||
|
static inline enum nfs_rto_timer_t
|
||||||
|
nfs_rto_timer(u_int32_t procnum)
|
||||||
|
{
|
||||||
|
return nfs_proct[procnum];
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Initialize the RTT estimator state for a new mount point.
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
nfs_init_rtt(struct nfsmount *nmp)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; i < NFS_MAX_TIMER; i++)
|
||||||
|
nmp->nm_srtt[i] = NFS_INITRTT;
|
||||||
|
for (i = 0; i < NFS_MAX_TIMER; i++)
|
||||||
|
nmp->nm_sdrtt[i] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Update a mount point's RTT estimator state using data from the
|
||||||
|
* passed-in request.
|
||||||
|
*
|
||||||
|
* Use a gain of 0.125 on the mean and a gain of 0.25 on the deviation.
|
||||||
|
*
|
||||||
|
* NB: Since the timer resolution of NFS_HZ is so course, it can often
|
||||||
|
* result in r_rtt == 0. Since r_rtt == N means that the actual RTT is
|
||||||
|
* between N + dt and N + 2 - dt ticks, add 1 before calculating the
|
||||||
|
* update values.
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
nfs_update_rtt(struct nfsreq *rep)
|
||||||
|
{
|
||||||
|
int t1 = rep->r_rtt + 1;
|
||||||
|
int index = nfs_rto_timer(rep->r_procnum) - 1;
|
||||||
|
int *srtt = &rep->r_nmp->nm_srtt[index];
|
||||||
|
int *sdrtt = &rep->r_nmp->nm_sdrtt[index];
|
||||||
|
|
||||||
|
t1 -= *srtt >> 3;
|
||||||
|
*srtt += t1;
|
||||||
|
if (t1 < 0)
|
||||||
|
t1 = -t1;
|
||||||
|
t1 -= *sdrtt >> 2;
|
||||||
|
*sdrtt += t1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Estimate RTO for an NFS RPC sent via an unreliable datagram.
|
||||||
|
*
|
||||||
|
* Use the mean and mean deviation of RTT for the appropriate type
|
||||||
|
* of RPC for the frequent RPCs and a default for the others.
|
||||||
|
* The justification for doing "other" this way is that these RPCs
|
||||||
|
* happen so infrequently that timer est. would probably be stale.
|
||||||
|
* Also, since many of these RPCs are non-idempotent, a conservative
|
||||||
|
* timeout is desired.
|
||||||
|
*
|
||||||
|
* getattr, lookup - A+2D
|
||||||
|
* read, write - A+4D
|
||||||
|
* other - nm_timeo
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
nfs_estimate_rto(struct nfsmount *nmp, u_int32_t procnum)
|
||||||
|
{
|
||||||
|
enum nfs_rto_timer_t timer = nfs_rto_timer(procnum);
|
||||||
|
int index = timer - 1;
|
||||||
|
int rto;
|
||||||
|
|
||||||
|
switch (timer) {
|
||||||
|
case NFS_GETATTR_TIMER:
|
||||||
|
case NFS_LOOKUP_TIMER:
|
||||||
|
rto = ((nmp->nm_srtt[index] + 3) >> 2) +
|
||||||
|
((nmp->nm_sdrtt[index] + 1) >> 1);
|
||||||
|
break;
|
||||||
|
case NFS_READ_TIMER:
|
||||||
|
case NFS_WRITE_TIMER:
|
||||||
|
rto = ((nmp->nm_srtt[index] + 7) >> 3) +
|
||||||
|
(nmp->nm_sdrtt[index] + 1);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
rto = nmp->nm_timeo;
|
||||||
|
return (rto);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (rto < NFS_MINRTO)
|
||||||
|
rto = NFS_MINRTO;
|
||||||
|
else if (rto > NFS_MAXRTO)
|
||||||
|
rto = NFS_MAXRTO;
|
||||||
|
|
||||||
|
return (rto);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Initialize sockets and congestion for a new NFS connection.
|
* Initialize sockets and congestion for a new NFS connection.
|
||||||
* We do not free the sockaddr if error.
|
* We do not free the sockaddr if error.
|
||||||
@ -357,10 +451,7 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
|
|||||||
|
|
||||||
mtx_lock(&nmp->nm_mtx);
|
mtx_lock(&nmp->nm_mtx);
|
||||||
/* Initialize other non-zero congestion variables */
|
/* Initialize other non-zero congestion variables */
|
||||||
nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
|
nfs_init_rtt(nmp);
|
||||||
nmp->nm_srtt[3] = (NFS_TIMEO << 3);
|
|
||||||
nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
|
|
||||||
nmp->nm_sdrtt[3] = 0;
|
|
||||||
nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */
|
nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */
|
||||||
nmp->nm_sent = 0;
|
nmp->nm_sent = 0;
|
||||||
nmp->nm_timeouts = 0;
|
nmp->nm_timeouts = 0;
|
||||||
@ -685,7 +776,6 @@ nfs_clnt_match_xid(struct socket *so,
|
|||||||
caddr_t dpos;
|
caddr_t dpos;
|
||||||
u_int32_t rxid, *tl;
|
u_int32_t rxid, *tl;
|
||||||
struct nfsreq *rep;
|
struct nfsreq *rep;
|
||||||
register int32_t t1;
|
|
||||||
int error;
|
int error;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -743,27 +833,8 @@ nfs_clnt_match_xid(struct socket *so,
|
|||||||
rep->r_flags &= ~R_SENT;
|
rep->r_flags &= ~R_SENT;
|
||||||
nmp->nm_sent -= NFS_CWNDSCALE;
|
nmp->nm_sent -= NFS_CWNDSCALE;
|
||||||
}
|
}
|
||||||
/*
|
if (rep->r_flags & R_TIMING)
|
||||||
* Update rtt using a gain of 0.125 on the mean
|
nfs_update_rtt(rep);
|
||||||
* and a gain of 0.25 on the deviation.
|
|
||||||
*/
|
|
||||||
if (rep->r_flags & R_TIMING) {
|
|
||||||
/*
|
|
||||||
* Since the timer resolution of
|
|
||||||
* NFS_HZ is so coarse, it can often
|
|
||||||
* result in r_rtt == 0. Since
|
|
||||||
* r_rtt == N means that the actual
|
|
||||||
* rtt is between N+dt and N+2-dt ticks,
|
|
||||||
* add 1.
|
|
||||||
*/
|
|
||||||
t1 = rep->r_rtt + 1;
|
|
||||||
t1 -= (NFS_SRTT(rep) >> 3);
|
|
||||||
NFS_SRTT(rep) += t1;
|
|
||||||
if (t1 < 0)
|
|
||||||
t1 = -t1;
|
|
||||||
t1 -= (NFS_SDRTT(rep) >> 2);
|
|
||||||
NFS_SDRTT(rep) += t1;
|
|
||||||
}
|
|
||||||
nmp->nm_timeouts = 0;
|
nmp->nm_timeouts = 0;
|
||||||
wakeup((caddr_t)rep);
|
wakeup((caddr_t)rep);
|
||||||
mtx_unlock(&rep->r_mtx);
|
mtx_unlock(&rep->r_mtx);
|
||||||
@ -1073,7 +1144,7 @@ nfs_request(struct vnode *vp, struct mbuf *mrest, int procnum,
|
|||||||
else
|
else
|
||||||
rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */
|
rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */
|
||||||
rep->r_rtt = rep->r_rexmit = 0;
|
rep->r_rtt = rep->r_rexmit = 0;
|
||||||
if (proct[procnum] > 0)
|
if (nfs_rto_timer(procnum) != NFS_DEFAULT_TIMER)
|
||||||
rep->r_flags = R_TIMING;
|
rep->r_flags = R_TIMING;
|
||||||
else
|
else
|
||||||
rep->r_flags = 0;
|
rep->r_flags = 0;
|
||||||
@ -1328,7 +1399,7 @@ nfs_timer(void *arg)
|
|||||||
if (nmp->nm_flag & NFSMNT_DUMBTIMR)
|
if (nmp->nm_flag & NFSMNT_DUMBTIMR)
|
||||||
timeo = nmp->nm_timeo;
|
timeo = nmp->nm_timeo;
|
||||||
else
|
else
|
||||||
timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
|
timeo = nfs_estimate_rto(nmp, rep->r_procnum);
|
||||||
if (nmp->nm_timeouts > 0)
|
if (nmp->nm_timeouts > 0)
|
||||||
timeo *= nfs_backoff[nmp->nm_timeouts - 1];
|
timeo *= nfs_backoff[nmp->nm_timeouts - 1];
|
||||||
if (rep->r_rtt <= timeo) {
|
if (rep->r_rtt <= timeo) {
|
||||||
|
@ -64,8 +64,8 @@ struct nfsmount {
|
|||||||
struct sockaddr *nm_nam; /* Addr of server */
|
struct sockaddr *nm_nam; /* Addr of server */
|
||||||
int nm_timeo; /* Init timer for NFSMNT_DUMBTIMR */
|
int nm_timeo; /* Init timer for NFSMNT_DUMBTIMR */
|
||||||
int nm_retry; /* Max retries */
|
int nm_retry; /* Max retries */
|
||||||
int nm_srtt[4]; /* Timers for rpcs */
|
int nm_srtt[NFS_MAX_TIMER], /* RTT Timers for rpcs */
|
||||||
int nm_sdrtt[4];
|
nm_sdrtt[NFS_MAX_TIMER];
|
||||||
int nm_sent; /* Request send count */
|
int nm_sent; /* Request send count */
|
||||||
int nm_cwnd; /* Request send window */
|
int nm_cwnd; /* Request send window */
|
||||||
int nm_timeouts; /* Request timeouts */
|
int nm_timeouts; /* Request timeouts */
|
||||||
|
Loading…
Reference in New Issue
Block a user