mirror of
https://git.FreeBSD.org/src.git
synced 2024-12-15 10:17:20 +00:00
f31dba4c5d
if a single process is performing a large number of requests (in this case writing a large file). The writing process could monopolise the recieve lock and prevent any other processes from recieving their replies. It also adds a new sysctl variable 'vfs.nfs.dwrite' which controls the behaviour which originally pointed out the problem. When a process writes to a file over NFS, it usually arranges for another process (the 'iod') to perform the request. If no iods are available, then it turns the write into a 'delayed write' which is later picked up by the next iod to do a write request for that file. This can cause that particular iod to do a disproportionate number of requests from a single process which can harm performance on some NFS servers. The alternative is to perform the write synchronously in the context of the original writing process if no iod is avaiable for asynchronous writing. The 'delayed write' behaviour is selected when vfs.nfs.dwrite=1 and the non-delayed behaviour is selected when vfs.nfs.dwrite=0. The default is vfs.nfs.dwrite=1; if many people tell me that performance is better if vfs.nfs.dwrite=0 then I will change the default. Submitted by: Hidetoshi Shimokawa <simokawa@sat.t.u-tokyo.ac.jp>
973 lines
26 KiB
C
973 lines
26 KiB
C
/*
|
|
* Copyright (c) 1989, 1993
|
|
* The Regents of the University of California. All rights reserved.
|
|
*
|
|
* This code is derived from software contributed to Berkeley by
|
|
* Rick Macklem at The University of Guelph.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* 3. All advertising materials mentioning features or use of this software
|
|
* must display the following acknowledgement:
|
|
* This product includes software developed by the University of
|
|
* California, Berkeley and its contributors.
|
|
* 4. Neither the name of the University nor the names of its contributors
|
|
* may be used to endorse or promote products derived from this software
|
|
* without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
* SUCH DAMAGE.
|
|
*
|
|
* @(#)nfs_bio.c 8.5 (Berkeley) 1/4/94
|
|
* $Id: nfs_bio.c,v 1.25 1996/09/19 18:20:54 nate Exp $
|
|
*/
|
|
|
|
#include <sys/param.h>
|
|
#include <sys/systm.h>
|
|
#include <sys/resourcevar.h>
|
|
#include <sys/signalvar.h>
|
|
#include <sys/proc.h>
|
|
#include <sys/buf.h>
|
|
#include <sys/vnode.h>
|
|
#include <sys/mount.h>
|
|
#include <sys/kernel.h>
|
|
#include <sys/sysctl.h>
|
|
|
|
#include <vm/vm.h>
|
|
#include <vm/vm_param.h>
|
|
#include <vm/vm_extern.h>
|
|
|
|
#include <nfs/rpcv2.h>
|
|
#include <nfs/nfsproto.h>
|
|
#include <nfs/nfs.h>
|
|
#include <nfs/nfsmount.h>
|
|
#include <nfs/nqnfs.h>
|
|
#include <nfs/nfsnode.h>
|
|
|
|
static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
|
|
struct proc *p));
|
|
|
|
extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON];
|
|
extern int nfs_numasync;
|
|
extern struct nfsstats nfsstats;
|
|
|
|
int nfs_dwrite = 1;
|
|
SYSCTL_INT(_vfs_nfs, OID_AUTO, dwrite, CTLFLAG_RW, &nfs_dwrite, 0, "");
|
|
|
|
/*
|
|
* Ifdefs for FreeBSD-current's merged VM/buffer cache. It is unfortunate
|
|
* that this isn't done inside getblk() and brelse() so these calls
|
|
* wouldn't need to be here.
|
|
*/
|
|
#ifdef B_VMIO
|
|
#define vnode_pager_uncache(vp)
|
|
#else
|
|
#define vfs_busy_pages(bp, f)
|
|
#define vfs_unbusy_pages(bp)
|
|
#define vfs_dirty_pages(bp)
|
|
#endif
|
|
|
|
/*
|
|
* Vnode op for read using bio
|
|
* Any similarity to readip() is purely coincidental
|
|
*/
|
|
int
|
|
nfs_bioread(vp, uio, ioflag, cred)
|
|
register struct vnode *vp;
|
|
register struct uio *uio;
|
|
int ioflag;
|
|
struct ucred *cred;
|
|
{
|
|
register struct nfsnode *np = VTONFS(vp);
|
|
register int biosize, diff, i;
|
|
struct buf *bp = 0, *rabp;
|
|
struct vattr vattr;
|
|
struct proc *p;
|
|
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
|
|
daddr_t lbn, rabn;
|
|
int bufsize;
|
|
int nra, error = 0, n = 0, on = 0, not_readin;
|
|
|
|
#ifdef DIAGNOSTIC
|
|
if (uio->uio_rw != UIO_READ)
|
|
panic("nfs_read mode");
|
|
#endif
|
|
if (uio->uio_resid == 0)
|
|
return (0);
|
|
if (uio->uio_offset < 0)
|
|
return (EINVAL);
|
|
p = uio->uio_procp;
|
|
if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
|
|
(void)nfs_fsinfo(nmp, vp, cred, p);
|
|
biosize = vp->v_mount->mnt_stat.f_iosize;
|
|
/*
|
|
* For nfs, cache consistency can only be maintained approximately.
|
|
* Although RFC1094 does not specify the criteria, the following is
|
|
* believed to be compatible with the reference port.
|
|
* For nqnfs, full cache consistency is maintained within the loop.
|
|
* For nfs:
|
|
* If the file's modify time on the server has changed since the
|
|
* last read rpc or you have written to the file,
|
|
* you may have lost data cache consistency with the
|
|
* server, so flush all of the file's data out of the cache.
|
|
* Then force a getattr rpc to ensure that you have up to date
|
|
* attributes.
|
|
* NB: This implies that cache data can be read when up to
|
|
* NFS_ATTRTIMEO seconds out of date. If you find that you need current
|
|
* attributes this could be forced by setting n_attrstamp to 0 before
|
|
* the VOP_GETATTR() call.
|
|
*/
|
|
if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
|
|
if (np->n_flag & NMODIFIED) {
|
|
if (vp->v_type != VREG) {
|
|
if (vp->v_type != VDIR)
|
|
panic("nfs: bioread, not dir");
|
|
nfs_invaldir(vp);
|
|
error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
|
|
if (error)
|
|
return (error);
|
|
}
|
|
np->n_attrstamp = 0;
|
|
error = VOP_GETATTR(vp, &vattr, cred, p);
|
|
if (error)
|
|
return (error);
|
|
np->n_mtime = vattr.va_mtime.tv_sec;
|
|
} else {
|
|
error = VOP_GETATTR(vp, &vattr, cred, p);
|
|
if (error)
|
|
return (error);
|
|
if (np->n_mtime != vattr.va_mtime.tv_sec) {
|
|
if (vp->v_type == VDIR)
|
|
nfs_invaldir(vp);
|
|
error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
|
|
if (error)
|
|
return (error);
|
|
np->n_mtime = vattr.va_mtime.tv_sec;
|
|
}
|
|
}
|
|
}
|
|
do {
|
|
|
|
/*
|
|
* Get a valid lease. If cached data is stale, flush it.
|
|
*/
|
|
if (nmp->nm_flag & NFSMNT_NQNFS) {
|
|
if (NQNFS_CKINVALID(vp, np, ND_READ)) {
|
|
do {
|
|
error = nqnfs_getlease(vp, ND_READ, cred, p);
|
|
} while (error == NQNFS_EXPIRED);
|
|
if (error)
|
|
return (error);
|
|
if (np->n_lrev != np->n_brev ||
|
|
(np->n_flag & NQNFSNONCACHE) ||
|
|
((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
|
|
if (vp->v_type == VDIR)
|
|
nfs_invaldir(vp);
|
|
error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
|
|
if (error)
|
|
return (error);
|
|
np->n_brev = np->n_lrev;
|
|
}
|
|
} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
|
|
nfs_invaldir(vp);
|
|
error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
|
|
if (error)
|
|
return (error);
|
|
}
|
|
}
|
|
if (np->n_flag & NQNFSNONCACHE) {
|
|
switch (vp->v_type) {
|
|
case VREG:
|
|
return (nfs_readrpc(vp, uio, cred));
|
|
case VLNK:
|
|
return (nfs_readlinkrpc(vp, uio, cred));
|
|
case VDIR:
|
|
break;
|
|
default:
|
|
printf(" NQNFSNONCACHE: type %x unexpected\n",
|
|
vp->v_type);
|
|
};
|
|
}
|
|
switch (vp->v_type) {
|
|
case VREG:
|
|
nfsstats.biocache_reads++;
|
|
lbn = uio->uio_offset / biosize;
|
|
on = uio->uio_offset & (biosize - 1);
|
|
not_readin = 1;
|
|
|
|
/*
|
|
* Start the read ahead(s), as required.
|
|
*/
|
|
if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
|
|
for (nra = 0; nra < nmp->nm_readahead &&
|
|
(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
|
|
rabn = lbn + 1 + nra;
|
|
if (!incore(vp, rabn)) {
|
|
rabp = nfs_getcacheblk(vp, rabn, biosize, p);
|
|
if (!rabp)
|
|
return (EINTR);
|
|
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
|
|
rabp->b_flags |= (B_READ | B_ASYNC);
|
|
vfs_busy_pages(rabp, 0);
|
|
if (nfs_asyncio(rabp, cred)) {
|
|
rabp->b_flags |= B_INVAL|B_ERROR;
|
|
vfs_unbusy_pages(rabp);
|
|
brelse(rabp);
|
|
}
|
|
} else {
|
|
brelse(rabp);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* If the block is in the cache and has the required data
|
|
* in a valid region, just copy it out.
|
|
* Otherwise, get the block and write back/read in,
|
|
* as required.
|
|
*/
|
|
again:
|
|
bufsize = biosize;
|
|
if ((off_t)(lbn + 1) * biosize > np->n_size &&
|
|
(off_t)(lbn + 1) * biosize - np->n_size < biosize) {
|
|
bufsize = np->n_size - lbn * biosize;
|
|
bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
|
|
}
|
|
bp = nfs_getcacheblk(vp, lbn, bufsize, p);
|
|
if (!bp)
|
|
return (EINTR);
|
|
if ((bp->b_flags & B_CACHE) == 0) {
|
|
bp->b_flags |= B_READ;
|
|
bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
|
|
not_readin = 0;
|
|
vfs_busy_pages(bp, 0);
|
|
error = nfs_doio(bp, cred, p);
|
|
if (error) {
|
|
brelse(bp);
|
|
return (error);
|
|
}
|
|
}
|
|
if (bufsize > on) {
|
|
n = min((unsigned)(bufsize - on), uio->uio_resid);
|
|
} else {
|
|
n = 0;
|
|
}
|
|
diff = np->n_size - uio->uio_offset;
|
|
if (diff < n)
|
|
n = diff;
|
|
if (not_readin && n > 0) {
|
|
if (on < bp->b_validoff || (on + n) > bp->b_validend) {
|
|
bp->b_flags |= B_NOCACHE;
|
|
if (bp->b_dirtyend > 0) {
|
|
if ((bp->b_flags & B_DELWRI) == 0)
|
|
panic("nfsbioread");
|
|
if (VOP_BWRITE(bp) == EINTR)
|
|
return (EINTR);
|
|
} else
|
|
brelse(bp);
|
|
goto again;
|
|
}
|
|
}
|
|
vp->v_lastr = lbn;
|
|
diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
|
|
if (diff < n)
|
|
n = diff;
|
|
break;
|
|
case VLNK:
|
|
nfsstats.biocache_readlinks++;
|
|
bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
|
|
if (!bp)
|
|
return (EINTR);
|
|
if ((bp->b_flags & B_CACHE) == 0) {
|
|
bp->b_flags |= B_READ;
|
|
vfs_busy_pages(bp, 0);
|
|
error = nfs_doio(bp, cred, p);
|
|
if (error) {
|
|
bp->b_flags |= B_ERROR;
|
|
brelse(bp);
|
|
return (error);
|
|
}
|
|
}
|
|
n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
|
|
on = 0;
|
|
break;
|
|
case VDIR:
|
|
nfsstats.biocache_readdirs++;
|
|
lbn = uio->uio_offset / NFS_DIRBLKSIZ;
|
|
on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
|
|
bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
|
|
if (!bp)
|
|
return (EINTR);
|
|
if ((bp->b_flags & B_CACHE) == 0) {
|
|
bp->b_flags |= B_READ;
|
|
vfs_busy_pages(bp, 0);
|
|
error = nfs_doio(bp, cred, p);
|
|
if (error) {
|
|
brelse(bp);
|
|
while (error == NFSERR_BAD_COOKIE) {
|
|
nfs_invaldir(vp);
|
|
error = nfs_vinvalbuf(vp, 0, cred, p, 1);
|
|
/*
|
|
* Yuck! The directory has been modified on the
|
|
* server. The only way to get the block is by
|
|
* reading from the beginning to get all the
|
|
* offset cookies.
|
|
*/
|
|
for (i = 0; i <= lbn && !error; i++) {
|
|
bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
|
|
if (!bp)
|
|
return (EINTR);
|
|
if ((bp->b_flags & B_DONE) == 0) {
|
|
bp->b_flags |= B_READ;
|
|
vfs_busy_pages(bp, 0);
|
|
error = nfs_doio(bp, cred, p);
|
|
if (error)
|
|
brelse(bp);
|
|
}
|
|
}
|
|
}
|
|
if (error)
|
|
return (error);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* If not eof and read aheads are enabled, start one.
|
|
* (You need the current block first, so that you have the
|
|
* directory offset cookie of the next block.)
|
|
*/
|
|
if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
|
|
(np->n_direofoffset == 0 ||
|
|
(lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
|
|
!(np->n_flag & NQNFSNONCACHE) &&
|
|
!incore(vp, lbn + 1)) {
|
|
rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
|
|
if (rabp) {
|
|
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
|
|
rabp->b_flags |= (B_READ | B_ASYNC);
|
|
vfs_busy_pages(rabp, 0);
|
|
if (nfs_asyncio(rabp, cred)) {
|
|
rabp->b_flags |= B_INVAL|B_ERROR;
|
|
vfs_unbusy_pages(rabp);
|
|
brelse(rabp);
|
|
}
|
|
} else {
|
|
brelse(rabp);
|
|
}
|
|
}
|
|
}
|
|
n = min(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
|
|
break;
|
|
default:
|
|
printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
|
|
break;
|
|
};
|
|
|
|
if (n > 0) {
|
|
error = uiomove(bp->b_data + on, (int)n, uio);
|
|
}
|
|
switch (vp->v_type) {
|
|
case VREG:
|
|
break;
|
|
case VLNK:
|
|
n = 0;
|
|
break;
|
|
case VDIR:
|
|
if (np->n_flag & NQNFSNONCACHE)
|
|
bp->b_flags |= B_INVAL;
|
|
break;
|
|
default:
|
|
printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
|
|
}
|
|
brelse(bp);
|
|
} while (error == 0 && uio->uio_resid > 0 && n > 0);
|
|
return (error);
|
|
}
|
|
|
|
/*
|
|
* Vnode op for write using bio
|
|
*/
|
|
int
|
|
nfs_write(ap)
|
|
struct vop_write_args /* {
|
|
struct vnode *a_vp;
|
|
struct uio *a_uio;
|
|
int a_ioflag;
|
|
struct ucred *a_cred;
|
|
} */ *ap;
|
|
{
|
|
register int biosize;
|
|
register struct uio *uio = ap->a_uio;
|
|
struct proc *p = uio->uio_procp;
|
|
register struct vnode *vp = ap->a_vp;
|
|
struct nfsnode *np = VTONFS(vp);
|
|
register struct ucred *cred = ap->a_cred;
|
|
int ioflag = ap->a_ioflag;
|
|
struct buf *bp;
|
|
struct vattr vattr;
|
|
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
|
|
daddr_t lbn;
|
|
int bufsize;
|
|
int n, on, error = 0, iomode, must_commit;
|
|
|
|
#ifdef DIAGNOSTIC
|
|
if (uio->uio_rw != UIO_WRITE)
|
|
panic("nfs_write mode");
|
|
if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
|
|
panic("nfs_write proc");
|
|
#endif
|
|
if (vp->v_type != VREG)
|
|
return (EIO);
|
|
if (np->n_flag & NWRITEERR) {
|
|
np->n_flag &= ~NWRITEERR;
|
|
return (np->n_error);
|
|
}
|
|
if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
|
|
(void)nfs_fsinfo(nmp, vp, cred, p);
|
|
if (ioflag & (IO_APPEND | IO_SYNC)) {
|
|
if (np->n_flag & NMODIFIED) {
|
|
np->n_attrstamp = 0;
|
|
error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
|
|
if (error)
|
|
return (error);
|
|
}
|
|
if (ioflag & IO_APPEND) {
|
|
np->n_attrstamp = 0;
|
|
error = VOP_GETATTR(vp, &vattr, cred, p);
|
|
if (error)
|
|
return (error);
|
|
uio->uio_offset = np->n_size;
|
|
}
|
|
}
|
|
if (uio->uio_offset < 0)
|
|
return (EINVAL);
|
|
if (uio->uio_resid == 0)
|
|
return (0);
|
|
/*
|
|
* Maybe this should be above the vnode op call, but so long as
|
|
* file servers have no limits, i don't think it matters
|
|
*/
|
|
if (p && uio->uio_offset + uio->uio_resid >
|
|
p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
|
|
psignal(p, SIGXFSZ);
|
|
return (EFBIG);
|
|
}
|
|
/*
|
|
* I use nm_rsize, not nm_wsize so that all buffer cache blocks
|
|
* will be the same size within a filesystem. nfs_writerpc will
|
|
* still use nm_wsize when sizing the rpc's.
|
|
*/
|
|
biosize = vp->v_mount->mnt_stat.f_iosize;
|
|
do {
|
|
|
|
/*
|
|
* XXX make sure we aren't cached in the VM page cache
|
|
*/
|
|
/*
|
|
* Check for a valid write lease.
|
|
*/
|
|
if ((nmp->nm_flag & NFSMNT_NQNFS) &&
|
|
NQNFS_CKINVALID(vp, np, ND_WRITE)) {
|
|
do {
|
|
error = nqnfs_getlease(vp, ND_WRITE, cred, p);
|
|
} while (error == NQNFS_EXPIRED);
|
|
if (error)
|
|
return (error);
|
|
if (np->n_lrev != np->n_brev ||
|
|
(np->n_flag & NQNFSNONCACHE)) {
|
|
error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
|
|
if (error)
|
|
return (error);
|
|
np->n_brev = np->n_lrev;
|
|
}
|
|
}
|
|
if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
|
|
iomode = NFSV3WRITE_FILESYNC;
|
|
error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
|
|
if (must_commit)
|
|
nfs_clearcommit(vp->v_mount);
|
|
return (error);
|
|
}
|
|
nfsstats.biocache_writes++;
|
|
lbn = uio->uio_offset / biosize;
|
|
on = uio->uio_offset & (biosize-1);
|
|
n = min((unsigned)(biosize - on), uio->uio_resid);
|
|
again:
|
|
if (uio->uio_offset + n > np->n_size) {
|
|
np->n_size = uio->uio_offset + n;
|
|
vnode_pager_setsize(vp, (u_long)np->n_size);
|
|
}
|
|
bufsize = biosize;
|
|
if ((lbn + 1) * biosize > np->n_size) {
|
|
bufsize = np->n_size - lbn * biosize;
|
|
bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
|
|
}
|
|
bp = nfs_getcacheblk(vp, lbn, bufsize, p);
|
|
if (!bp)
|
|
return (EINTR);
|
|
if (bp->b_wcred == NOCRED) {
|
|
crhold(cred);
|
|
bp->b_wcred = cred;
|
|
}
|
|
np->n_flag |= NMODIFIED;
|
|
|
|
if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) {
|
|
bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
|
|
}
|
|
|
|
/*
|
|
* If the new write will leave a contiguous dirty
|
|
* area, just update the b_dirtyoff and b_dirtyend,
|
|
* otherwise force a write rpc of the old dirty area.
|
|
*/
|
|
if (bp->b_dirtyend > 0 &&
|
|
(on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
|
|
bp->b_proc = p;
|
|
if (VOP_BWRITE(bp) == EINTR)
|
|
return (EINTR);
|
|
goto again;
|
|
}
|
|
|
|
/*
|
|
* Check for valid write lease and get one as required.
|
|
* In case getblk() and/or bwrite() delayed us.
|
|
*/
|
|
if ((nmp->nm_flag & NFSMNT_NQNFS) &&
|
|
NQNFS_CKINVALID(vp, np, ND_WRITE)) {
|
|
do {
|
|
error = nqnfs_getlease(vp, ND_WRITE, cred, p);
|
|
} while (error == NQNFS_EXPIRED);
|
|
if (error) {
|
|
brelse(bp);
|
|
return (error);
|
|
}
|
|
if (np->n_lrev != np->n_brev ||
|
|
(np->n_flag & NQNFSNONCACHE)) {
|
|
brelse(bp);
|
|
error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
|
|
if (error)
|
|
return (error);
|
|
np->n_brev = np->n_lrev;
|
|
goto again;
|
|
}
|
|
}
|
|
error = uiomove((char *)bp->b_data + on, n, uio);
|
|
if (error) {
|
|
bp->b_flags |= B_ERROR;
|
|
brelse(bp);
|
|
return (error);
|
|
}
|
|
if (bp->b_dirtyend > 0) {
|
|
bp->b_dirtyoff = min(on, bp->b_dirtyoff);
|
|
bp->b_dirtyend = max((on + n), bp->b_dirtyend);
|
|
} else {
|
|
bp->b_dirtyoff = on;
|
|
bp->b_dirtyend = on + n;
|
|
}
|
|
if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
|
|
bp->b_validoff > bp->b_dirtyend) {
|
|
bp->b_validoff = bp->b_dirtyoff;
|
|
bp->b_validend = bp->b_dirtyend;
|
|
} else {
|
|
bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
|
|
bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
|
|
}
|
|
|
|
/*
|
|
* Since this block is being modified, it must be written
|
|
* again and not just committed.
|
|
*/
|
|
bp->b_flags &= ~B_NEEDCOMMIT;
|
|
|
|
/*
|
|
* If the lease is non-cachable or IO_SYNC do bwrite().
|
|
*/
|
|
if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
|
|
bp->b_proc = p;
|
|
error = VOP_BWRITE(bp);
|
|
if (error)
|
|
return (error);
|
|
if (np->n_flag & NQNFSNONCACHE) {
|
|
error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
|
|
if (error)
|
|
return (error);
|
|
}
|
|
} else if ((n + on) == biosize &&
|
|
(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
|
|
bp->b_proc = (struct proc *)0;
|
|
bp->b_flags |= B_ASYNC;
|
|
(void)nfs_writebp(bp, 0);
|
|
} else
|
|
bdwrite(bp);
|
|
} while (uio->uio_resid > 0 && n > 0);
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* Get an nfs cache block.
|
|
* Allocate a new one if the block isn't currently in the cache
|
|
* and return the block marked busy. If the calling process is
|
|
* interrupted by a signal for an interruptible mount point, return
|
|
* NULL.
|
|
*/
|
|
static struct buf *
|
|
nfs_getcacheblk(vp, bn, size, p)
|
|
struct vnode *vp;
|
|
daddr_t bn;
|
|
int size;
|
|
struct proc *p;
|
|
{
|
|
register struct buf *bp;
|
|
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
|
|
int biosize = vp->v_mount->mnt_stat.f_iosize;
|
|
|
|
if (nmp->nm_flag & NFSMNT_INT) {
|
|
bp = getblk(vp, bn, size, PCATCH, 0);
|
|
while (bp == (struct buf *)0) {
|
|
if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
|
|
return ((struct buf *)0);
|
|
bp = getblk(vp, bn, size, 0, 2 * hz);
|
|
}
|
|
} else
|
|
bp = getblk(vp, bn, size, 0, 0);
|
|
|
|
if( vp->v_type == VREG)
|
|
bp->b_blkno = (bn * biosize) / DEV_BSIZE;
|
|
|
|
return (bp);
|
|
}
|
|
|
|
/*
|
|
* Flush and invalidate all dirty buffers. If another process is already
|
|
* doing the flush, just wait for completion.
|
|
*/
|
|
int
|
|
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
|
|
struct vnode *vp;
|
|
int flags;
|
|
struct ucred *cred;
|
|
struct proc *p;
|
|
int intrflg;
|
|
{
|
|
register struct nfsnode *np = VTONFS(vp);
|
|
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
|
|
int error = 0, slpflag, slptimeo;
|
|
|
|
if ((nmp->nm_flag & NFSMNT_INT) == 0)
|
|
intrflg = 0;
|
|
if (intrflg) {
|
|
slpflag = PCATCH;
|
|
slptimeo = 2 * hz;
|
|
} else {
|
|
slpflag = 0;
|
|
slptimeo = 0;
|
|
}
|
|
/*
|
|
* First wait for any other process doing a flush to complete.
|
|
*/
|
|
while (np->n_flag & NFLUSHINPROG) {
|
|
np->n_flag |= NFLUSHWANT;
|
|
error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
|
|
slptimeo);
|
|
if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
|
|
return (EINTR);
|
|
}
|
|
|
|
/*
|
|
* Now, flush as required.
|
|
*/
|
|
np->n_flag |= NFLUSHINPROG;
|
|
error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
|
|
while (error) {
|
|
if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
|
|
np->n_flag &= ~NFLUSHINPROG;
|
|
if (np->n_flag & NFLUSHWANT) {
|
|
np->n_flag &= ~NFLUSHWANT;
|
|
wakeup((caddr_t)&np->n_flag);
|
|
}
|
|
return (EINTR);
|
|
}
|
|
error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
|
|
}
|
|
np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
|
|
if (np->n_flag & NFLUSHWANT) {
|
|
np->n_flag &= ~NFLUSHWANT;
|
|
wakeup((caddr_t)&np->n_flag);
|
|
}
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* Initiate asynchronous I/O. Return an error if no nfsiods are available.
|
|
* This is mainly to avoid queueing async I/O requests when the nfsiods
|
|
* are all hung on a dead server.
|
|
*/
|
|
int
|
|
nfs_asyncio(bp, cred)
|
|
register struct buf *bp;
|
|
struct ucred *cred;
|
|
{
|
|
register int i;
|
|
|
|
if (nfs_numasync == 0)
|
|
return (EIO);
|
|
for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
|
|
if (nfs_iodwant[i]) {
|
|
if (bp->b_flags & B_READ) {
|
|
if (bp->b_rcred == NOCRED && cred != NOCRED) {
|
|
crhold(cred);
|
|
bp->b_rcred = cred;
|
|
}
|
|
} else {
|
|
bp->b_flags |= B_WRITEINPROG;
|
|
if (bp->b_wcred == NOCRED && cred != NOCRED) {
|
|
crhold(cred);
|
|
bp->b_wcred = cred;
|
|
}
|
|
}
|
|
|
|
TAILQ_INSERT_TAIL(&nfs_bufq, bp, b_freelist);
|
|
nfs_iodwant[i] = (struct proc *)0;
|
|
wakeup((caddr_t)&nfs_iodwant[i]);
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* If it is a read or a write already marked B_WRITEINPROG or B_NOCACHE
|
|
* return EIO so the process will call nfs_doio() and do it
|
|
* synchronously.
|
|
*/
|
|
if (bp->b_flags & (B_READ | B_WRITEINPROG | B_NOCACHE))
|
|
return (EIO);
|
|
|
|
/*
|
|
* Allow the administrator to override the choice of using a delayed
|
|
* write since it is a pessimization for some servers, notably some
|
|
* Solaris servers.
|
|
*/
|
|
if (!nfs_dwrite)
|
|
return (EIO);
|
|
|
|
/*
|
|
* Just turn the async write into a delayed write, instead of
|
|
* doing in synchronously. Hopefully, at least one of the nfsiods
|
|
* is currently doing a write for this file and will pick up the
|
|
* delayed writes before going back to sleep.
|
|
*/
|
|
bp->b_flags |= B_DELWRI;
|
|
reassignbuf(bp, bp->b_vp);
|
|
biodone(bp);
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* Do an I/O operation to/from a cache block. This may be called
|
|
* synchronously or from an nfsiod.
|
|
*/
|
|
int
|
|
nfs_doio(bp, cr, p)
|
|
register struct buf *bp;
|
|
struct ucred *cr;
|
|
struct proc *p;
|
|
{
|
|
register struct uio *uiop;
|
|
register struct vnode *vp;
|
|
struct nfsnode *np;
|
|
struct nfsmount *nmp;
|
|
int error = 0, diff, len, iomode, must_commit = 0;
|
|
struct uio uio;
|
|
struct iovec io;
|
|
|
|
vp = bp->b_vp;
|
|
np = VTONFS(vp);
|
|
nmp = VFSTONFS(vp->v_mount);
|
|
uiop = &uio;
|
|
uiop->uio_iov = &io;
|
|
uiop->uio_iovcnt = 1;
|
|
uiop->uio_segflg = UIO_SYSSPACE;
|
|
uiop->uio_procp = p;
|
|
|
|
/*
|
|
* Historically, paging was done with physio, but no more.
|
|
*/
|
|
if (bp->b_flags & B_PHYS) {
|
|
/*
|
|
* ...though reading /dev/drum still gets us here.
|
|
*/
|
|
io.iov_len = uiop->uio_resid = bp->b_bcount;
|
|
/* mapping was done by vmapbuf() */
|
|
io.iov_base = bp->b_data;
|
|
uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
|
|
if (bp->b_flags & B_READ) {
|
|
uiop->uio_rw = UIO_READ;
|
|
nfsstats.read_physios++;
|
|
error = nfs_readrpc(vp, uiop, cr);
|
|
} else {
|
|
int com;
|
|
|
|
iomode = NFSV3WRITE_DATASYNC;
|
|
uiop->uio_rw = UIO_WRITE;
|
|
nfsstats.write_physios++;
|
|
error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
|
|
}
|
|
if (error) {
|
|
bp->b_flags |= B_ERROR;
|
|
bp->b_error = error;
|
|
}
|
|
} else if (bp->b_flags & B_READ) {
|
|
io.iov_len = uiop->uio_resid = bp->b_bcount;
|
|
io.iov_base = bp->b_data;
|
|
uiop->uio_rw = UIO_READ;
|
|
switch (vp->v_type) {
|
|
case VREG:
|
|
uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
|
|
nfsstats.read_bios++;
|
|
error = nfs_readrpc(vp, uiop, cr);
|
|
if (!error) {
|
|
bp->b_validoff = 0;
|
|
if (uiop->uio_resid) {
|
|
/*
|
|
* If len > 0, there is a hole in the file and
|
|
* no writes after the hole have been pushed to
|
|
* the server yet.
|
|
* Just zero fill the rest of the valid area.
|
|
*/
|
|
diff = bp->b_bcount - uiop->uio_resid;
|
|
len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
|
|
+ diff);
|
|
if (len > 0) {
|
|
len = min(len, uiop->uio_resid);
|
|
bzero((char *)bp->b_data + diff, len);
|
|
bp->b_validend = diff + len;
|
|
} else
|
|
bp->b_validend = diff;
|
|
} else
|
|
bp->b_validend = bp->b_bcount;
|
|
}
|
|
if (p && (vp->v_flag & VTEXT) &&
|
|
(((nmp->nm_flag & NFSMNT_NQNFS) &&
|
|
NQNFS_CKINVALID(vp, np, ND_READ) &&
|
|
np->n_lrev != np->n_brev) ||
|
|
(!(nmp->nm_flag & NFSMNT_NQNFS) &&
|
|
np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
|
|
uprintf("Process killed due to text file modification\n");
|
|
psignal(p, SIGKILL);
|
|
#ifdef __NetBSD__
|
|
p->p_holdcnt++;
|
|
#else
|
|
p->p_flag |= P_NOSWAP;
|
|
#endif
|
|
}
|
|
break;
|
|
case VLNK:
|
|
uiop->uio_offset = (off_t)0;
|
|
nfsstats.readlink_bios++;
|
|
error = nfs_readlinkrpc(vp, uiop, cr);
|
|
break;
|
|
case VDIR:
|
|
nfsstats.readdir_bios++;
|
|
uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
|
|
if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
|
|
error = nfs_readdirplusrpc(vp, uiop, cr);
|
|
if (error == NFSERR_NOTSUPP)
|
|
nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
|
|
}
|
|
if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
|
|
error = nfs_readdirrpc(vp, uiop, cr);
|
|
break;
|
|
default:
|
|
printf("nfs_doio: type %x unexpected\n",vp->v_type);
|
|
break;
|
|
};
|
|
if (error) {
|
|
bp->b_flags |= B_ERROR;
|
|
bp->b_error = error;
|
|
}
|
|
} else {
|
|
if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size)
|
|
bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
|
|
|
|
if (bp->b_dirtyend > bp->b_dirtyoff) {
|
|
io.iov_len = uiop->uio_resid = bp->b_dirtyend
|
|
- bp->b_dirtyoff;
|
|
uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
|
|
+ bp->b_dirtyoff;
|
|
io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
|
|
uiop->uio_rw = UIO_WRITE;
|
|
nfsstats.write_bios++;
|
|
if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == B_ASYNC)
|
|
iomode = NFSV3WRITE_UNSTABLE;
|
|
else
|
|
iomode = NFSV3WRITE_FILESYNC;
|
|
bp->b_flags |= B_WRITEINPROG;
|
|
error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
|
|
if (!error && iomode == NFSV3WRITE_UNSTABLE)
|
|
bp->b_flags |= B_NEEDCOMMIT;
|
|
else
|
|
bp->b_flags &= ~B_NEEDCOMMIT;
|
|
bp->b_flags &= ~B_WRITEINPROG;
|
|
|
|
/*
|
|
* For an interrupted write, the buffer is still valid
|
|
* and the write hasn't been pushed to the server yet,
|
|
* so we can't set B_ERROR and report the interruption
|
|
* by setting B_EINTR. For the B_ASYNC case, B_EINTR
|
|
* is not relevant, so the rpc attempt is essentially
|
|
* a noop. For the case of a V3 write rpc not being
|
|
* committed to stable storage, the block is still
|
|
* dirty and requires either a commit rpc or another
|
|
* write rpc with iomode == NFSV3WRITE_FILESYNC before
|
|
* the block is reused. This is indicated by setting
|
|
* the B_DELWRI and B_NEEDCOMMIT flags.
|
|
*/
|
|
if (error == EINTR
|
|
|| (!error && (bp->b_flags & B_NEEDCOMMIT))) {
|
|
bp->b_flags &= ~(B_INVAL|B_NOCACHE);
|
|
bp->b_flags |= B_DELWRI;
|
|
|
|
/*
|
|
* Since for the B_ASYNC case, nfs_bwrite() has reassigned the
|
|
* buffer to the clean list, we have to reassign it back to the
|
|
* dirty one. Ugh.
|
|
*/
|
|
if (bp->b_flags & B_ASYNC)
|
|
reassignbuf(bp, vp);
|
|
else
|
|
bp->b_flags |= B_EINTR;
|
|
} else {
|
|
if (error) {
|
|
bp->b_flags |= B_ERROR;
|
|
bp->b_error = np->n_error = error;
|
|
np->n_flag |= NWRITEERR;
|
|
}
|
|
bp->b_dirtyoff = bp->b_dirtyend = 0;
|
|
}
|
|
} else {
|
|
bp->b_resid = 0;
|
|
biodone(bp);
|
|
return (0);
|
|
}
|
|
}
|
|
bp->b_resid = uiop->uio_resid;
|
|
if (must_commit)
|
|
nfs_clearcommit(vp->v_mount);
|
|
biodone(bp);
|
|
return (error);
|
|
}
|