mirror of https://git.FreeBSD.org/src.git
Prevent large files from monopolizing the system buffers.

Keep track of the number of dirty buffers held by a vnode. When a
bdwrite is done on a buffer, check the existing number of dirty
buffers associated with its vnode. If the number rises above
vfs.dirtybufthresh (currently 90% of vfs.hidirtybuffers), one of
the other (hopefully older) dirty buffers associated with the vnode
is written (using bawrite). In the event that this approach fails
to curb the growth in the vnode's number of dirty buffers (due to
soft updates rollback dependencies), the more drastic approach of
doing a VOP_FSYNC on the vnode is used. This code primarily affects
very large and actively written files such as snapshots. This
change should eliminate hanging when taking snapshots or doing
background fsck on very large filesystems.

Hopefully, one day it will be possible to cache filesystem metadata
in the VM cache as is done with file data. As it stands, only the
buffer cache can be used, which limits total metadata storage to
about 20MB no matter how much memory is available on the system.
This rather small memory gets badly thrashed, causing a lot of
extra I/O. For example, taking a snapshot of a 1TB filesystem
minimally requires about 35,000 write operations, but because of
the cache thrashing (we only have about 350 buffers at our
disposal) it ends up doing about 237,540 I/Os, thus taking
twenty-five minutes instead of four if it could run entirely in
the cache.

Reported by:    Attila Nagy <bra@fsn.hu>
Sponsored by:   DARPA & NAI Labs
commit 3a7053cb60
parent d4b570f053

Notes:
    svn2git  2020-12-20 02:59:44 +00:00
    svn path=/head/; revision=111466
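As an aside, the counters this commit adds are exported as vfs sysctls, so their effect can be watched from userland. Below is a minimal sketch, not part of this commit; only the sysctl names are taken from the diff that follows, and error handling is trimmed. It reads the new knobs with sysctlbyname(3):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

/*
 * Illustrative userland check of the knobs introduced below.
 * Only the sysctl names come from the commit; this program is
 * a sketch, not part of the change itself.
 */
int
main(void)
{
        const char *names[] = { "vfs.dirtybufthresh", "vfs.hidirtybuffers",
            "vfs.dirtybufferflushes", "vfs.altbufferflushes" };
        size_t i;

        for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
                int val;
                size_t len = sizeof(val);

                if (sysctlbyname(names[i], &val, &len, NULL, 0) == 0)
                        printf("%s: %d\n", names[i], val);
        }
        return (0);
}

Watching vfs.dirtybufferflushes and vfs.altbufferflushes grow under a snapshot workload would show which of the two relief mechanisms (bawrite conversion or VOP_FSYNC) is firing.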
@@ -124,6 +124,12 @@ SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0,
 static int hirunningspace;
 SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
     "Maximum amount of space to use for in-progress I/O");
+static int dirtybufferflushes;
+SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
+    0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
+static int altbufferflushes;
+SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
+    0, "Number of fsync flushes to limit dirty buffers");
 static int numdirtybuffers;
 SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
     "Number of buffers that are dirty (has unwritten changes) at the moment");
@@ -133,6 +139,9 @@ SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
 static int hidirtybuffers;
 SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
     "When the number of dirty buffers is considered severe");
+static int dirtybufthresh;
+SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
+    0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
 static int numfreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
     "Number of free buffers");
@@ -584,6 +593,7 @@ bufinit(void)
          * of delayed-write dirty buffers we allow to stack up.
          */
         hidirtybuffers = nbuf / 4 + 20;
+        dirtybufthresh = hidirtybuffers * 9 / 10;
         numdirtybuffers = 0;
         /*
          * To support extreme low-memory systems, make sure hidirtybuffers cannot
@@ -993,6 +1003,10 @@ vfs_backgroundwritedone(bp)
 void
 bdwrite(struct buf * bp)
 {
+        struct thread *td = curthread;
+        struct vnode *vp;
+        struct buf *nbp;
+
         GIANT_REQUIRED;

         if (BUF_REFCNT(bp) == 0)
@@ -1002,8 +1016,47 @@ bdwrite(struct buf * bp)
                 brelse(bp);
                 return;
         }
-        bdirty(bp);
+
+        /*
+         * If we have too many dirty buffers, don't create any more.
+         * If we are wildly over our limit, then force a complete
+         * cleanup. Otherwise, just keep the situation from getting
+         * out of control.
+         */
+        vp = bp->b_vp;
+        VI_LOCK(vp);
+        if (vp != NULL && vp->v_dirtybufcnt > dirtybufthresh + 10) {
+                VI_UNLOCK(vp);
+                (void) VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td);
+                VI_LOCK(vp);
+                altbufferflushes++;
+        } else if (vp != NULL && vp->v_dirtybufcnt > dirtybufthresh) {
+                /*
+                 * Try to find a buffer to flush.
+                 */
+                TAILQ_FOREACH(nbp, &vp->v_dirtyblkhd, b_vnbufs) {
+                        if ((nbp->b_xflags & BX_BKGRDINPROG) ||
+                            buf_countdeps(nbp, 0) ||
+                            BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT))
+                                continue;
+                        if (bp == nbp)
+                                panic("bdwrite: found ourselves");
+                        VI_UNLOCK(vp);
+                        if (nbp->b_flags & B_CLUSTEROK) {
+                                BUF_UNLOCK(nbp);
+                                vfs_bio_awrite(nbp);
+                        } else {
+                                bremfree(nbp);
+                                bawrite(nbp);
+                        }
+                        VI_LOCK(vp);
+                        dirtybufferflushes++;
+                        break;
+                }
+        }
+        VI_UNLOCK(vp);

+        bdirty(bp);
         /*
          * Set B_CACHE, indicating that the buffer is fully valid.  This is
          * true even of NFS now.
@@ -1019,8 +1072,8 @@ bdwrite(struct buf * bp)
          * requesting a sync -- there might not be enough memory to do
          * the bmap then...  So, this is important to do.
          */
-        if (bp->b_vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) {
-                VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
+        if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) {
+                VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
         }

         /*
@@ -990,7 +990,9 @@ getnewvnode(tag, mp, vops, vpp)
                 vp->v_socket = 0;
                 lockdestroy(vp->v_vnlock);
                 lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOPAUSE);
+                KASSERT(vp->v_cleanbufcnt == 0, ("cleanbufcnt not 0"));
                 KASSERT(vp->v_cleanblkroot == NULL, ("cleanblkroot not NULL"));
+                KASSERT(vp->v_dirtybufcnt == 0, ("dirtybufcnt not 0"));
                 KASSERT(vp->v_dirtyblkroot == NULL, ("dirtyblkroot not NULL"));
         } else {
                 numvnodes++;
@@ -1470,6 +1472,7 @@ buf_vlist_remove(struct buf *bp)
                 }
                 vp->v_dirtyblkroot = root;
                 TAILQ_REMOVE(&vp->v_dirtyblkhd, bp, b_vnbufs);
+                vp->v_dirtybufcnt--;
         } else {
                 /* KASSERT(bp->b_xflags & BX_VNCLEAN, ("bp wasn't clean")); */
                 if (bp != vp->v_cleanblkroot) {
@@ -1484,6 +1487,7 @@ buf_vlist_remove(struct buf *bp)
                 }
                 vp->v_cleanblkroot = root;
                 TAILQ_REMOVE(&vp->v_cleanblkhd, bp, b_vnbufs);
+                vp->v_cleanbufcnt--;
         }
         bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
 }
@@ -1522,6 +1526,7 @@ buf_vlist_add(struct buf *bp, struct vnode *vp, b_xflags_t xflags)
                         TAILQ_INSERT_AFTER(&vp->v_dirtyblkhd,
                             root, bp, b_vnbufs);
                 }
+                vp->v_dirtybufcnt++;
                 vp->v_dirtyblkroot = bp;
         } else {
                 /* KASSERT(xflags & BX_VNCLEAN, ("xflags not clean")); */
@@ -1544,6 +1549,7 @@ buf_vlist_add(struct buf *bp, struct vnode *vp, b_xflags_t xflags)
                         TAILQ_INSERT_AFTER(&vp->v_cleanblkhd,
                             root, bp, b_vnbufs);
                 }
+                vp->v_cleanbufcnt++;
                 vp->v_cleanblkroot = bp;
         }
 }
@@ -108,8 +108,10 @@ struct vnode {
         int     v_holdcnt;                      /* i page & buffer references */
         struct  buflists v_cleanblkhd;          /* i SORTED clean blocklist */
         struct  buf *v_cleanblkroot;            /* i clean buf splay tree */
+        int     v_cleanbufcnt;                  /* i number of clean buffers */
         struct  buflists v_dirtyblkhd;          /* i SORTED dirty blocklist */
         struct  buf *v_dirtyblkroot;            /* i dirty buf splay tree */
+        int     v_dirtybufcnt;                  /* i number of dirty buffers */
         u_long  v_vflag;                        /* v vnode flags */
         int     v_writecount;                   /* v ref count of writers */
         struct  vm_object *v_object;            /* v Place to store VM object */
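To make the thresholds concrete, here is a small sketch that recomputes the formulas from the bufinit() hunk above for a hypothetical nbuf; the kernel sizes nbuf at boot, so the numbers are illustrative only:

#include <stdio.h>

/*
 * Recomputes the thresholds set in bufinit() above for a
 * hypothetical nbuf of 1000 (illustrative; the kernel sizes
 * nbuf at boot based on available memory).
 */
int
main(void)
{
        int nbuf = 1000;                                /* hypothetical */
        int hidirtybuffers = nbuf / 4 + 20;             /* 270 */
        int dirtybufthresh = hidirtybuffers * 9 / 10;   /* 243 */

        printf("bawrite conversions start above %d dirty buffers\n",
            dirtybufthresh);
        printf("VOP_FSYNC fallback starts above %d dirty buffers\n",
            dirtybufthresh + 10);
        printf("severe level (hidirtybuffers) is %d\n", hidirtybuffers);
        return (0);
}

With these illustrative values, bdwrite() starts converting a vnode's delayed writes to bawrite() once it holds more than 243 dirty buffers, and escalates to VOP_FSYNC() above 253.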