Fix a serious deadlock with the NFS client. Given a large enough

atomic write request, it can fill the buffer cache with the entirety of that write in order to handle retries. However, it never drops the vnode lock, or else it wouldn't be atomic, so it ends up waiting indefinitely for more buf memory that cannot be gotten as it has it all, and it waits in an uncancellable state. To fix this, hibufspace is exported and scaled to a reasonable fraction. This is used as the limit of how much of an atomic write request by the NFS client will be handled asynchronously. If the request is larger than this, it will be turned into a synchronous request which won't deadlock the system. It's possible this value is far off from what is required by some, so it shall be tunable as soon as mount_nfs(8) learns of the new field. The slowdown between an asynchronous and a synchronous write on NFS appears to be on the order of 2x-4x. General nod by: gad MFC after: 2 weeks More testing: wes PR: kern/79208
svn path=/head/; revision=147280
2024-12-28 11:57:28 +00:00 · 2005-06-10 23:50:41 +00:00 · 2005-06-10 23:50:41 +00:00 · cc3149b1ea · 2020-12-20 02:59:44 +00:00
commit cc3149b1ea
parent c7c0d2e3e4
7 changed files with 81 additions and 5 deletions
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@ -127,7 +127,7 @@ SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
 static int lobufspace;
 SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
    "Minimum amount of buffers we want to have");
-static int hibufspace;
+int hibufspace;
 SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
    "Maximum allowed value of bufspace (excluding buf_daemon)");
 static int bufreusecnt;
--- a/sys/nfsclient/nfs_bio.c
+++ b/sys/nfsclient/nfs_bio.c
@ -873,6 +873,14 @@ nfs_write(struct vop_write_args *ap)
 	 */
 	if (ioflag & (IO_APPEND | IO_SYNC)) {
 		if (np->n_flag & NMODIFIED) {
+			/*
+			 * Require non-blocking, synchronous writes to
+			 * dirty files to inform the program it needs
+			 * to fsync(2) explicitly.
+			 */
+			if (ioflag & IO_NDELAY)
+				return (EAGAIN);
+flush_and_restart:
 			np->n_attrstamp = 0;
 			error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
 			if (error)
@ -953,6 +961,63 @@ nfs_write(struct vop_write_args *ap)
 	}

 	biosize = vp->v_mount->mnt_stat.f_iosize;
+	/*
+	 * Find all of this file's B_NEEDCOMMIT buffers.  If our writes
+	 * would exceed the local maximum per-file write commit size when
+	 * combined with those, we must decide whether to flush,
+	 * go synchronous, or return error.  We don't bother checking
+	 * IO_UNIT -- we just make all writes atomic anyway, as there's
+	 * no point optimizing for something that really won't ever happen.
+	 */
+	if (!(ioflag & IO_SYNC)) {
+		int needrestart = 0;
+		if (nmp->nm_wcommitsize < uio->uio_resid) {
+			/*
+			 * If this request could not possibly be completed
+			 * without exceeding the maximum outstanding write
+			 * commit size, see if we can convert it into a
+			 * synchronous write operation.
+			 */
+			if (ioflag & IO_NDELAY)
+				return (EAGAIN);
+			ioflag |= IO_SYNC;
+			if (np->n_flag & NMODIFIED)
+				needrestart = 1;
+		} else if (np->n_flag & NMODIFIED) {
+			int wouldcommit = 0;
+			BO_LOCK(&vp->v_bufobj);
+			if (vp->v_bufobj.bo_dirty.bv_cnt != 0) {
+				TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd,
+				    b_bobufs) {
+					if (bp->b_flags & B_NEEDCOMMIT)
+						wouldcommit += bp->b_bcount;
+				}
+			}
+			BO_UNLOCK(&vp->v_bufobj);
+			/*
+			 * Since we're not operating synchronously and
+			 * bypassing the buffer cache, we are in a commit
+			 * and holding all of these buffers whether
+			 * transmitted or not.  If not limited, this
+			 * will lead to the buffer cache deadlocking,
+			 * as no one else can flush our uncommitted buffers.
+			 */
+			wouldcommit += uio->uio_resid;
+			/*
+			 * If we would initially exceed the maximum
+			 * outstanding write commit size, flush and restart.
+			 */
+			if (wouldcommit > nmp->nm_wcommitsize)
+				needrestart = 1;
+		}
+		if (needrestart) {
+			if (haverslock) {
+				nfs_rsunlock(np, td);
+				haverslock = 0;
+			}
+			goto flush_and_restart;
+		}
+	}

 	do {
 		nfsstats.biocache_writes++;
--- a/sys/nfsclient/nfs_vfsops.c
+++ b/sys/nfsclient/nfs_vfsops.c
@ -41,6 +41,8 @@ __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
@ -633,6 +635,12 @@ nfs_decode_args(struct mount *mp, struct nfsmount *nmp, struct nfs_args *argp)
 		else
 			nmp->nm_readahead = NFS_MAXRAHEAD;
 	}
+	if ((argp->flags & NFSMNT_WCOMMITSIZE) && argp->wcommitsize >= 0) {
+		if (argp->wcommitsize < nmp->nm_wsize)
+			nmp->nm_wcommitsize = nmp->nm_wsize;
+		else
+			nmp->nm_wcommitsize = argp->wcommitsize;
+	}
 	if ((argp->flags & NFSMNT_DEADTHRESH) && argp->deadthresh >= 0) {
 		if (argp->deadthresh <= NFS_MAXDEADTHRESH)
 			nmp->nm_deadthresh = argp->deadthresh;
@ -815,6 +823,7 @@ mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam,
 		nmp->nm_wsize = NFS_WSIZE;
 		nmp->nm_rsize = NFS_RSIZE;
 	}
+	nmp->nm_wcommitsize = hibufspace / (desiredvnodes / 1000);
 	nmp->nm_readdirsize = NFS_READDIRSIZE;
 	nmp->nm_numgrps = NFS_MAXGRPS;
 	nmp->nm_readahead = NFS_DEFRAHEAD;
--- a/sys/nfsclient/nfsargs.h
+++ b/sys/nfsclient/nfsargs.h
@ -56,7 +56,7 @@ struct nfs_args {
 	int		retrans;	/* times to retry send */
 	int		maxgrouplist;	/* Max. size of group list */
 	int		readahead;	/* # of blocks to readahead */
-	int		__pad1;		/* was "leaseterm" */
+	int		wcommitsize;	/* Max. write commit size in bytes */
 	int		deadthresh;	/* Retrans threshold */
 	char		*hostname;	/* server's name */
 	int		acregmin;	/* cache attrs for reg files min time */
@ -80,7 +80,7 @@ struct nfs_args {
 #define	NFSMNT_NFSV3		0x00000200  /* Use NFS Version 3 protocol */
 /* 0x400 free, was NFSMNT_KERB */
 #define	NFSMNT_DUMBTIMR		0x00000800  /* Don't estimate rtt dynamically */
-/* 0x1000 free, was NFSMNT_LEASETERM */
+#define	NFSMNT_WCOMMITSIZE	0x00001000  /* set max write commit size */
 #define	NFSMNT_READAHEAD	0x00002000  /* set read ahead */
 #define	NFSMNT_DEADTHRESH	0x00004000  /* set dead server retry thresh */
 #define	NFSMNT_RESVPORT		0x00008000  /* Allocate a reserved port */
--- a/sys/nfsclient/nfsmount.h
+++ b/sys/nfsclient/nfsmount.h
@ -74,6 +74,7 @@ struct	nfsmount {
 	int	nm_wsize;		/* Max size of write rpc */
 	int	nm_readdirsize;		/* Size of a readdir rpc */
 	int	nm_readahead;		/* Num. of blocks to readahead */
+	int	nm_wcommitsize;		/* Max size of commit for write */
 	int	nm_acdirmin;		/* Directory attr cache min lifetime */
 	int	nm_acdirmax;		/* Directory attr cache max lifetime */
 	int	nm_acregmin;		/* Reg file attr cache min lifetime */
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@ -465,6 +465,7 @@ extern int	nbuf;			/* The number of buffer headers */
 extern int	maxswzone;		/* Max KVA for swap structures */
 extern int	maxbcache;		/* Max KVA for buffer cache */
 extern int	runningbufspace;
+extern int	hibufspace;
 extern int      buf_maxio;              /* nominal maximum I/O for buffer */
 extern struct	buf *buf;		/* The buffer headers. */
 extern char	*buffers;		/* The buffer contents. */
--- a/sys/sys/bufobj.h
+++ b/sys/sys/bufobj.h
@ -109,13 +109,13 @@ struct bufobj {

 #define	BO_LOCK(bo) \
 	do { \
-		KASSERT (bo->bo_mtx != NULL, ("No lock in bufobj")); \
+		KASSERT((bo)->bo_mtx != NULL, ("No lock in bufobj")); \
 		mtx_lock((bo)->bo_mtx); \
 	} while (0)

 #define BO_UNLOCK(bo) \
 	do { \
-		KASSERT (bo->bo_mtx != NULL, ("No lock in bufobj")); \
+		KASSERT((bo)->bo_mtx != NULL, ("No lock in bufobj")); \
 		mtx_unlock((bo)->bo_mtx); \
 	} while (0)