1
0
mirror of https://git.FreeBSD.org/src.git synced 2024-12-22 11:17:19 +00:00

An inode block must not be blockingly read while cg block is owned.

The order is inode buffer lock -> snaplk -> cg buffer lock, reversing
the order causes deadlocks.

Inode block must not be written while cg block buffer is owned. The
FFS copy on write needs to allocate a block to copy the content of the
inode block, and the cylinder group selected for the allocation might
be the same as the owned cg block.  The reserved block detection code
in the ffs_copyonwrite() and ffs_bp_snapblk() is unable to detect the
situation, because the locked cg buffer is not exposed to it.

In order to maintain the dependency between initialized inode block
and the cg_initediblk pointer, look up the inode buffer in
non-blocking mode. If succeeded, brelse cg block, initialize the inode
block and write it.  After the write is finished, reread cg block and
update the cg_initediblk.

If inode block is already locked by another thread, let the another
thread initialize it.  If another thread raced with us after we
started writing inode block, the situation is detected by an update of
cg_initediblk.  Note that double-initialization of the inode block is
harmless, the block cannot be used until cg_initediblk is incremented.

Sponsored by:	The FreeBSD Foundation
In collaboration with:	pho
Reviewed by:	mckusick
MFC after:	1 month
X-MFC-note:	after r246877
This commit is contained in:
Konstantin Belousov 2013-02-27 07:31:23 +00:00
parent e63b930908
commit 94f4ac214c
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=247387

View File

@ -1790,6 +1790,17 @@ ffs_clusteralloc(ip, cg, bpref, len, unused)
return (0);
}
static inline struct buf *
getinobuf(struct inode *ip, u_int cg, u_int32_t cginoblk, int gbflags)
{
struct fs *fs;
fs = ip->i_fs;
return (getblk(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs,
cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0,
gbflags));
}
/*
* Determine whether an inode can be allocated.
*
@ -1814,9 +1825,11 @@ ffs_nodealloccg(ip, cg, ipref, mode, unused)
u_int8_t *inosused, *loc;
struct ufs2_dinode *dp2;
int error, start, len, i;
u_int32_t old_initediblk;
fs = ip->i_fs;
ump = ip->i_ump;
check_nifree:
if (fs->fs_cs(fs, cg).cs_nifree == 0)
return (0);
UFS_UNLOCK(ump);
@ -1828,13 +1841,13 @@ ffs_nodealloccg(ip, cg, ipref, mode, unused)
return (0);
}
cgp = (struct cg *)bp->b_data;
restart:
if (!cg_chkmagic(cgp) || cgp->cg_cs.cs_nifree == 0) {
brelse(bp);
UFS_LOCK(ump);
return (0);
}
bp->b_xflags |= BX_BKGRDWRITE;
cgp->cg_old_time = cgp->cg_time = time_second;
inosused = cg_inosused(cgp);
if (ipref) {
ipref %= fs->fs_ipg;
@ -1856,7 +1869,6 @@ ffs_nodealloccg(ip, cg, ipref, mode, unused)
}
}
ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1;
cgp->cg_irotor = ipref;
gotit:
/*
* Check to see if we need to initialize more inodes.
@ -1864,9 +1876,37 @@ ffs_nodealloccg(ip, cg, ipref, mode, unused)
if (fs->fs_magic == FS_UFS2_MAGIC &&
ipref + INOPB(fs) > cgp->cg_initediblk &&
cgp->cg_initediblk < cgp->cg_niblk) {
ibp = getblk(ip->i_devvp, fsbtodb(fs,
ino_to_fsba(fs, cg * fs->fs_ipg + cgp->cg_initediblk)),
(int)fs->fs_bsize, 0, 0, 0);
old_initediblk = cgp->cg_initediblk;
/*
* Free the cylinder group lock before writing the
* initialized inode block. Entering the
* babarrierwrite() with the cylinder group lock
* causes lock order violation between the lock and
* snaplk.
*
* Another thread can decide to initialize the same
* inode block, but whichever thread first gets the
* cylinder group lock after writing the newly
* allocated inode block will update it and the other
* will realize that it has lost and leave the
* cylinder group unchanged.
*/
ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT);
brelse(bp);
if (ibp == NULL) {
/*
* The inode block buffer is already owned by
* another thread, which must initialize it.
* Wait on the buffer to allow another thread
* to finish the updates, with dropped cg
* buffer lock, then retry.
*/
ibp = getinobuf(ip, cg, old_initediblk, 0);
brelse(ibp);
UFS_LOCK(ump);
goto check_nifree;
}
bzero(ibp->b_data, (int)fs->fs_bsize);
dp2 = (struct ufs2_dinode *)(ibp->b_data);
for (i = 0; i < INOPB(fs); i++) {
@ -1883,8 +1923,29 @@ ffs_nodealloccg(ip, cg, ipref, mode, unused)
* loading of newly created filesystems.
*/
babarrierwrite(ibp);
cgp->cg_initediblk += INOPB(fs);
/*
* After the inode block is written, try to update the
* cg initediblk pointer. If another thread beat us
* to it, then leave it unchanged as the other thread
* has already set it correctly.
*/
error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
(int)fs->fs_cgsize, NOCRED, &bp);
UFS_LOCK(ump);
ACTIVECLEAR(fs, cg);
UFS_UNLOCK(ump);
if (error != 0) {
brelse(bp);
return (error);
}
cgp = (struct cg *)bp->b_data;
if (cgp->cg_initediblk == old_initediblk)
cgp->cg_initediblk += INOPB(fs);
goto restart;
}
cgp->cg_old_time = cgp->cg_time = time_second;
cgp->cg_irotor = ipref;
UFS_LOCK(ump);
ACTIVECLEAR(fs, cg);
setbit(inosused, ipref);