1
0
mirror of https://git.FreeBSD.org/src.git synced 2025-01-16 15:11:52 +00:00

Refactor the NFS over UDP retransmit timeout estimation logic to allow

the estimator to be more easily tuned and maintained.

There should be no functional change except there is now a lower limit
on the retransmit timeout to prevent the client from retransmitting
faster than the server's disks can fill requests, and an upper limit
to prevent the estimator from taking too long to retransmit during a
server outage.

Reviewed by:	mohan, kris, silby
Sponsored by:	Network Appliance, Incorporated
This commit is contained in:
Chuck Lever 2006-05-23 18:33:58 +00:00
parent 6a09faf2cb
commit 94163ea283
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=158859
3 changed files with 158 additions and 62 deletions

View File

@ -257,6 +257,31 @@ extern int nfs_debug;
#endif #endif
/*
 * On fast networks, the estimator will try to reduce the
 * timeout lower than the latency of the server's disks,
 * which results in too many timeouts, so cap the lower
 * bound.
 *
 * The floor is one quarter of NFS_HZ. nfs_estimate_rto() applies it to
 * the per-procedure estimates only; the nm_timeo default is returned
 * unclamped.
 */
#define NFS_MINRTO (NFS_HZ >> 2)
/*
 * Keep the RTO from increasing to unreasonably large values
 * when a server is not responding.
 *
 * The ceiling is twenty times NFS_HZ and is applied at the same place
 * as NFS_MINRTO.
 */
#define NFS_MAXRTO (20 * NFS_HZ)
/*
 * RTT timer class assigned to an NFS procedure.
 *
 * NFS_DEFAULT_TIMER means the procedure has no per-procedure estimator.
 * Every other value, minus one, is used as the index into the
 * nm_srtt[] and nm_sdrtt[] arrays, so the numeric values matter.
 */
enum nfs_rto_timer_t {
	NFS_DEFAULT_TIMER = 0,
	NFS_GETATTR_TIMER = 1,
	NFS_LOOKUP_TIMER = 2,
	NFS_READ_TIMER = 3,
	NFS_WRITE_TIMER = 4
};

/* Largest timer class; also the number of nm_srtt[]/nm_sdrtt[] slots. */
#define NFS_MAX_TIMER (NFS_WRITE_TIMER)

/* Value stored in every nm_srtt[] slot by nfs_init_rtt(). */
#define NFS_INITRTT (NFS_HZ << 3)
vfs_init_t nfs_init; vfs_init_t nfs_init;
vfs_uninit_t nfs_uninit; vfs_uninit_t nfs_uninit;
int nfs_mountroot(struct mount *mp, struct thread *td); int nfs_mountroot(struct mount *mp, struct thread *td);

View File

@ -79,38 +79,6 @@ __FBSDID("$FreeBSD$");
extern u_int32_t nfs_xid; extern u_int32_t nfs_xid;
/*
* Estimate rto for an nfs rpc sent via an unreliable datagram.
* Use the mean and mean deviation of rtt for the appropriate type of rpc
* for the frequent rpcs and a default for the others.
* The justification for doing "other" this way is that these rpcs
* happen so infrequently that timer est. would probably be stale.
* Also, since many of these rpcs are
* non-idempotent, a conservative timeout is desired.
* getattr, lookup - A+2D
* read, write - A+4D
* other - nm_timeo
*/
/*
 * NFS_RTO(n, t): retransmit timeout for mount n and timer class t.
 *	t == 0			n->nm_timeo
 *	t < 3 (and non-zero)	((((srtt + 3) >> 2) + sdrtt + 1) >> 1)
 *	otherwise		(((srtt + 7) >> 3) + sdrtt + 1)
 * where srtt is n->nm_srtt[t-1] and sdrtt is n->nm_sdrtt[t-1].
 *
 * t is evaluated more than once and is not parenthesized inside the
 * array subscripts, so pass a simple side-effect-free expression.
 */
#define NFS_RTO(n, t) \
((t) == 0 ? (n)->nm_timeo : \
((t) < 3 ? \
(((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
/*
 * Lvalue accessors for the nm_srtt[] / nm_sdrtt[] slot that proct[]
 * selects for request r's procedure. The subscript is proct[...] - 1,
 * so these are only valid for procedures whose proct[] entry is non-zero.
 */
#define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
#define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
/*
 * Timer class for each NFS procedure number:
 *	0 - default
 *	1 - getattr
 *	2 - lookup
 *	3 - read
 *	4 - write
 * A non-zero class, minus one, selects the nm_srtt[]/nm_sdrtt[] slot.
 */
static int proct[NFS_NPROCS] = {
	0, 1, 0, 2, 1, 3, 3, 4,		/* procedures 0-7 */
	0, 0, 0, 0, 0, 0, 0, 0,		/* procedures 8-15 */
	3, 3,				/* procedures 16-17 */
	0, 0, 0, 0, 0,			/* procedures 18-22 */
};
static int nfs_realign_test; static int nfs_realign_test;
static int nfs_realign_count; static int nfs_realign_count;
static int nfs_bufpackets = 4; static int nfs_bufpackets = 4;
@ -156,6 +124,132 @@ static void nfs_clnt_udp_soupcall(struct socket *so, void *arg, int waitflag);
extern struct mtx nfs_reqq_mtx; extern struct mtx nfs_reqq_mtx;
/*
 * RTT estimator
 *
 * nfs_proct[] maps an NFS procedure number to the RTT timer class whose
 * estimator state is used for that procedure.
 */
static enum nfs_rto_timer_t nfs_proct[NFS_NPROCS] = {
	[0]  = NFS_DEFAULT_TIMER,	/* NULL */
	[1]  = NFS_GETATTR_TIMER,	/* GETATTR */
	[2]  = NFS_DEFAULT_TIMER,	/* SETATTR */
	[3]  = NFS_LOOKUP_TIMER,	/* LOOKUP */
	[4]  = NFS_GETATTR_TIMER,	/* ACCESS */
	[5]  = NFS_READ_TIMER,		/* READLINK */
	[6]  = NFS_READ_TIMER,		/* READ */
	[7]  = NFS_WRITE_TIMER,		/* WRITE */
	[8]  = NFS_DEFAULT_TIMER,	/* CREATE */
	[9]  = NFS_DEFAULT_TIMER,	/* MKDIR */
	[10] = NFS_DEFAULT_TIMER,	/* SYMLINK */
	[11] = NFS_DEFAULT_TIMER,	/* MKNOD */
	[12] = NFS_DEFAULT_TIMER,	/* REMOVE */
	[13] = NFS_DEFAULT_TIMER,	/* RMDIR */
	[14] = NFS_DEFAULT_TIMER,	/* RENAME */
	[15] = NFS_DEFAULT_TIMER,	/* LINK */
	[16] = NFS_READ_TIMER,		/* READDIR */
	[17] = NFS_READ_TIMER,		/* READDIRPLUS */
	[18] = NFS_DEFAULT_TIMER,	/* FSSTAT */
	[19] = NFS_DEFAULT_TIMER,	/* FSINFO */
	[20] = NFS_DEFAULT_TIMER,	/* PATHCONF */
	[21] = NFS_DEFAULT_TIMER,	/* COMMIT */
	[22] = NFS_DEFAULT_TIMER,	/* NOOP */
};
/*
* Choose the correct RTT timer for this NFS procedure.
*/
static inline enum nfs_rto_timer_t
nfs_rto_timer(u_int32_t procnum)
{
return nfs_proct[procnum];
}
/*
 * Reset the RTT estimator of a mount point to its starting state:
 * every smoothed-RTT slot is set to NFS_INITRTT and every deviation
 * slot is cleared.
 */
static void
nfs_init_rtt(struct nfsmount *nmp)
{
	int slot;

	for (slot = 0; slot < NFS_MAX_TIMER; slot++) {
		nmp->nm_srtt[slot] = NFS_INITRTT;
		nmp->nm_sdrtt[slot] = 0;
	}
}
/*
* Update a mount point's RTT estimator state using data from the
* passed-in request.
*
* Use a gain of 0.125 on the mean and a gain of 0.25 on the deviation.
*
* NB: Since the timer resolution of NFS_HZ is so course, it can often
* result in r_rtt == 0. Since r_rtt == N means that the actual RTT is
* between N + dt and N + 2 - dt ticks, add 1 before calculating the
* update values.
*/
static void
nfs_update_rtt(struct nfsreq *rep)
{
int t1 = rep->r_rtt + 1;
int index = nfs_rto_timer(rep->r_procnum) - 1;
int *srtt = &rep->r_nmp->nm_srtt[index];
int *sdrtt = &rep->r_nmp->nm_sdrtt[index];
t1 -= *srtt >> 3;
*srtt += t1;
if (t1 < 0)
t1 = -t1;
t1 -= *sdrtt >> 2;
*sdrtt += t1;
}
/*
 * Estimate RTO for an NFS RPC sent via an unreliable datagram.
 *
 * Use the mean and mean deviation of RTT for the appropriate type
 * of RPC for the frequent RPCs and a default for the others.
 * The justification for doing "other" this way is that these RPCs
 * happen so infrequently that timer est. would probably be stale.
 * Also, since many of these RPCs are non-idempotent, a conservative
 * timeout is desired.
 *
 * getattr, lookup - A+2D
 * read, write - A+4D
 * other - nm_timeo
 *
 * nm_srtt[] holds 8*A and nm_sdrtt[] holds 4*D (see the gains used in
 * nfs_update_rtt()), hence the shifts below.
 *
 * Returns the timeout in the same units as nm_timeo. Estimates derived
 * from the RTT state are clamped to [NFS_MINRTO, NFS_MAXRTO]; the
 * nm_timeo default is returned as-is.
 */
static int
nfs_estimate_rto(struct nfsmount *nmp, u_int32_t procnum)
{
	enum nfs_rto_timer_t timer = nfs_rto_timer(procnum);
	int index;
	int rto;

	switch (timer) {
	case NFS_GETATTR_TIMER:
	case NFS_LOOKUP_TIMER:
		index = timer - 1;
		/*
		 * A + 2D == (2A + 4D) / 2. The final halving must cover
		 * the mean term as well, otherwise the result is 2A + 2D.
		 */
		rto = (((nmp->nm_srtt[index] + 3) >> 2) +
		    nmp->nm_sdrtt[index] + 1) >> 1;
		break;
	case NFS_READ_TIMER:
	case NFS_WRITE_TIMER:
		index = timer - 1;
		/* A + 4D */
		rto = ((nmp->nm_srtt[index] + 7) >> 3) +
		    (nmp->nm_sdrtt[index] + 1);
		break;
	default:
		/* No estimator for this procedure: use the mount's timeout. */
		return (nmp->nm_timeo);
	}

	if (rto < NFS_MINRTO)
		rto = NFS_MINRTO;
	else if (rto > NFS_MAXRTO)
		rto = NFS_MAXRTO;

	return (rto);
}
/* /*
* Initialize sockets and congestion for a new NFS connection. * Initialize sockets and congestion for a new NFS connection.
* We do not free the sockaddr if error. * We do not free the sockaddr if error.
@ -357,10 +451,7 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
mtx_lock(&nmp->nm_mtx); mtx_lock(&nmp->nm_mtx);
/* Initialize other non-zero congestion variables */ /* Initialize other non-zero congestion variables */
nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] = nfs_init_rtt(nmp);
nmp->nm_srtt[3] = (NFS_TIMEO << 3);
nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
nmp->nm_sdrtt[3] = 0;
nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */ nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */
nmp->nm_sent = 0; nmp->nm_sent = 0;
nmp->nm_timeouts = 0; nmp->nm_timeouts = 0;
@ -685,7 +776,6 @@ nfs_clnt_match_xid(struct socket *so,
caddr_t dpos; caddr_t dpos;
u_int32_t rxid, *tl; u_int32_t rxid, *tl;
struct nfsreq *rep; struct nfsreq *rep;
register int32_t t1;
int error; int error;
/* /*
@ -743,27 +833,8 @@ nfs_clnt_match_xid(struct socket *so,
rep->r_flags &= ~R_SENT; rep->r_flags &= ~R_SENT;
nmp->nm_sent -= NFS_CWNDSCALE; nmp->nm_sent -= NFS_CWNDSCALE;
} }
/* if (rep->r_flags & R_TIMING)
* Update rtt using a gain of 0.125 on the mean nfs_update_rtt(rep);
* and a gain of 0.25 on the deviation.
*/
if (rep->r_flags & R_TIMING) {
/*
* Since the timer resolution of
* NFS_HZ is so coarse, it can often
* result in r_rtt == 0. Since
* r_rtt == N means that the actual
* rtt is between N+dt and N+2-dt ticks,
* add 1.
*/
t1 = rep->r_rtt + 1;
t1 -= (NFS_SRTT(rep) >> 3);
NFS_SRTT(rep) += t1;
if (t1 < 0)
t1 = -t1;
t1 -= (NFS_SDRTT(rep) >> 2);
NFS_SDRTT(rep) += t1;
}
nmp->nm_timeouts = 0; nmp->nm_timeouts = 0;
wakeup((caddr_t)rep); wakeup((caddr_t)rep);
mtx_unlock(&rep->r_mtx); mtx_unlock(&rep->r_mtx);
@ -1073,7 +1144,7 @@ nfs_request(struct vnode *vp, struct mbuf *mrest, int procnum,
else else
rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */ rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */
rep->r_rtt = rep->r_rexmit = 0; rep->r_rtt = rep->r_rexmit = 0;
if (proct[procnum] > 0) if (nfs_rto_timer(procnum) != NFS_DEFAULT_TIMER)
rep->r_flags = R_TIMING; rep->r_flags = R_TIMING;
else else
rep->r_flags = 0; rep->r_flags = 0;
@ -1328,7 +1399,7 @@ nfs_timer(void *arg)
if (nmp->nm_flag & NFSMNT_DUMBTIMR) if (nmp->nm_flag & NFSMNT_DUMBTIMR)
timeo = nmp->nm_timeo; timeo = nmp->nm_timeo;
else else
timeo = NFS_RTO(nmp, proct[rep->r_procnum]); timeo = nfs_estimate_rto(nmp, rep->r_procnum);
if (nmp->nm_timeouts > 0) if (nmp->nm_timeouts > 0)
timeo *= nfs_backoff[nmp->nm_timeouts - 1]; timeo *= nfs_backoff[nmp->nm_timeouts - 1];
if (rep->r_rtt <= timeo) { if (rep->r_rtt <= timeo) {

View File

@ -64,8 +64,8 @@ struct nfsmount {
struct sockaddr *nm_nam; /* Addr of server */ struct sockaddr *nm_nam; /* Addr of server */
int nm_timeo; /* Init timer for NFSMNT_DUMBTIMR */ int nm_timeo; /* Init timer for NFSMNT_DUMBTIMR */
int nm_retry; /* Max retries */ int nm_retry; /* Max retries */
int nm_srtt[4]; /* Timers for rpcs */ int nm_srtt[NFS_MAX_TIMER], /* RTT Timers for rpcs */
int nm_sdrtt[4]; nm_sdrtt[NFS_MAX_TIMER];
int nm_sent; /* Request send count */ int nm_sent; /* Request send count */
int nm_cwnd; /* Request send window */ int nm_cwnd; /* Request send window */
int nm_timeouts; /* Request timeouts */ int nm_timeouts; /* Request timeouts */