diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 98da01a373f..62407fee071 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -235,6 +235,15 @@ vn_close(vp, flags, cred, p)
 	if (flags & FWRITE)
 		vp->v_writecount--;
 	error = VOP_CLOSE(vp, flags, cred, p);
+	/*
+	 * XXX - In certain instances VOP_CLOSE has to do the vrele
+	 * itself. If the vrele has been done, it will return EAGAIN
+	 * to indicate that the vrele should not be done again. When
+	 * this happens, we just return success. The correct thing to
+	 * do would be to have all VOP_CLOSE instances do the vrele.
+	 */
+	if (error == EAGAIN)
+		return (0);
 	vrele(vp);
 	return (error);
 }
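The EAGAIN convention above is essentially an ownership hand-off for the vnode reference: if the close routine has already dropped the reference itself, the caller must not drop it again. A minimal user-space sketch of the same pattern, with hypothetical names (obj_close, obj_release) standing in for VOP_CLOSE and vrele:

#include <errno.h>
#include <stdio.h>

struct obj {
	int refcount;
};

static void
obj_release(struct obj *op)
{
	if (--op->refcount == 0)
		printf("object freed\n");
}

/* Stand-in for a VOP_CLOSE that may consume the reference itself. */
static int
obj_close(struct obj *op, int consumes_ref)
{
	if (consumes_ref) {
		obj_release(op);	/* the "vrele" done internally */
		return (EAGAIN);	/* tell the caller not to repeat it */
	}
	return (0);
}

/* Mirrors the vn_close() logic in the hunk above. */
static int
close_and_release(struct obj *op, int consumes_ref)
{
	int error;

	error = obj_close(op, consumes_ref);
	if (error == EAGAIN)
		return (0);		/* reference already dropped */
	obj_release(op);
	return (error);
}

int
main(void)
{
	struct obj a = { 1 }, b = { 1 };

	close_and_release(&a, 0);	/* caller drops the reference */
	close_and_release(&b, 1);	/* callee dropped it; no double free */
	return (0);
}

Both sides must agree on the convention; as the XXX comment notes, the cleaner design would be for every VOP_CLOSE implementation to do the vrele, removing the special case entirely.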
diff --git a/sys/ufs/ffs/ffs_snapshot.c b/sys/ufs/ffs/ffs_snapshot.c
index 2b091621311..0e1e68fc7de 100644
--- a/sys/ufs/ffs/ffs_snapshot.c
+++ b/sys/ufs/ffs/ffs_snapshot.c
@@ -198,10 +198,14 @@ restart:
 	}
 	/*
 	 * Allocate shadow blocks to copy all of the other snapshot inodes
-	 * so that we will be able to expunge them from this snapshot.
+	 * so that we will be able to expunge them from this snapshot. Also
+	 * include a copy of ourselves so that we do not deadlock trying
+	 * to copyonwrite ourselves when VOP_FSYNC'ing below.
 	 */
-	for (loc = 0, inoblkcnt = 0; loc < snaploc; loc++) {
+	fs->fs_snapinum[snaploc] = ip->i_number;
+	for (loc = snaploc, inoblkcnt = 0; loc >= 0; loc--) {
 		blkno = fragstoblks(fs, ino_to_fsba(fs, fs->fs_snapinum[loc]));
+		fs->fs_snapinum[snaploc] = 0;
 		for (i = 0; i < inoblkcnt; i++)
 			if (inoblks[i] == blkno)
 				break;
@@ -652,14 +656,14 @@ ffs_snapremove(vp)
 	ip = VTOI(vp);
 	fs = ip->i_fs;
 	/*
-	 * Delete from incore list.
+	 * If active, delete from incore list (this snapshot may
+	 * already have been in the process of being deleted, so
+	 * would not have been active).
+	 *
 	 * Clear copy-on-write flag if last snapshot.
 	 */
-	devvp = ip->i_devvp;
-	if (ip->i_nextsnap.tqe_prev == 0) {
-		printf("ffs_snapremove: lost snapshot vnode %d\n",
-		    ip->i_number);
-	} else {
+	if (ip->i_nextsnap.tqe_prev != 0) {
+		devvp = ip->i_devvp;
 		TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap);
 		ip->i_nextsnap.tqe_prev = 0;
 		if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) == 0) {
@@ -832,9 +836,10 @@ ffs_snapblkfree(freeip, bno, size)
 		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, 0, &cbp);
 		p->p_flag &= ~P_COWINPROGRESS;
-		VOP_UNLOCK(vp, 0, p);
-		if (error)
+		if (error) {
+			VOP_UNLOCK(vp, 0, p);
 			break;
+		}
 #ifdef DEBUG
 		if (snapdebug)
 			printf("%s%d lbn %d for inum %d size %ld to blkno %d\n",
@@ -843,22 +848,44 @@ ffs_snapblkfree(freeip, bno, size)
 #endif
 		/*
 		 * If we have already read the old block contents, then
-		 * simply copy them to the new block.
+		 * simply copy them to the new block. Note that we need
+		 * to synchronously write snapshots that have not been
+		 * unlinked, and hence will be visible after a crash,
+		 * to ensure their integrity.
 		 */
 		if (savedcbp != 0) {
 			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
+			if (ip->i_effnlink > 0)
+				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
+			VOP_UNLOCK(vp, 0, p);
 			continue;
 		}
 		/*
 		 * Otherwise, read the old block contents into the buffer.
 		 */
-		if ((error = readblock(cbp, lbn)) != 0)
+		if ((error = readblock(cbp, lbn)) != 0) {
+			bzero(cbp->b_data, fs->fs_bsize);
+			bawrite(cbp);
+			if (ip->i_effnlink > 0)
+				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
+			VOP_UNLOCK(vp, 0, p);
 			break;
+		}
 		savedcbp = cbp;
 	}
-	if (savedcbp)
+	/*
+	 * Note that we need to synchronously write snapshots that
+	 * have not been unlinked, and hence will be visible after
+	 * a crash, to ensure their integrity.
+	 */
+	if (savedcbp) {
+		vp = savedcbp->b_vp;
 		bawrite(savedcbp);
+		if (VTOI(vp)->i_effnlink > 0)
+			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
+		VOP_UNLOCK(vp, 0, p);
+	}
 	/*
 	 * If we have been unable to allocate a block in which to do
 	 * the copy, then return non-zero so that the fragment will
@@ -1014,8 +1041,8 @@ retry:
 		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, B_NOWAIT, &cbp);
 		p->p_flag &= ~P_COWINPROGRESS;
-		VOP_UNLOCK(vp, 0, p);
 		if (error) {
+			VOP_UNLOCK(vp, 0, p);
 			if (error != EWOULDBLOCK)
 				break;
 			tsleep(vp, p->p_pri.pri_user, "nap", 1);
@@ -1035,22 +1062,44 @@ retry:
 #endif
 		/*
 		 * If we have already read the old block contents, then
-		 * simply copy them to the new block.
+		 * simply copy them to the new block. Note that we need
+		 * to synchronously write snapshots that have not been
+		 * unlinked, and hence will be visible after a crash,
+		 * to ensure their integrity.
 		 */
		if (savedcbp != 0) {
 			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
+			if (ip->i_effnlink > 0)
+				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
+			VOP_UNLOCK(vp, 0, p);
 			continue;
 		}
 		/*
 		 * Otherwise, read the old block contents into the buffer.
 		 */
-		if ((error = readblock(cbp, lbn)) != 0)
+		if ((error = readblock(cbp, lbn)) != 0) {
+			bzero(cbp->b_data, fs->fs_bsize);
+			bawrite(cbp);
+			if (ip->i_effnlink > 0)
+				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
+			VOP_UNLOCK(vp, 0, p);
 			break;
+		}
 		savedcbp = cbp;
 	}
-	if (savedcbp)
+	/*
+	 * Note that we need to synchronously write snapshots that
+	 * have not been unlinked, and hence will be visible after
+	 * a crash, to ensure their integrity.
+	 */
+	if (savedcbp) {
+		vp = savedcbp->b_vp;
 		bawrite(savedcbp);
+		if (VTOI(vp)->i_effnlink > 0)
+			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
+		VOP_UNLOCK(vp, 0, p);
+	}
 	return (error);
 }
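The recurring pattern in both ffs_snapblkfree() and ffs_copyonwrite() above is: allocate the copy block, fill it, bawrite() it, then VOP_FSYNC the snapshot synchronously only when i_effnlink > 0, that is, only when the snapshot is still linked and therefore will be visible after a crash; an unlinked snapshot dies with the crash anyway and may be written lazily. A rough user-space analogue of that rule, assuming a hypothetical copy_block() helper, with pwrite/st_nlink/fsync standing in for bawrite/i_effnlink/VOP_FSYNC (an open-but-unlinked file reports st_nlink == 0, much like i_effnlink):

#include <sys/stat.h>
#include <unistd.h>

static int
copy_block(int fd, const void *buf, size_t size, off_t off)
{
	struct stat st;

	/* Queue the copied block (the bawrite() step). */
	if (pwrite(fd, buf, size, off) != (ssize_t)size)
		return (-1);
	if (fstat(fd, &st) == -1)
		return (-1);
	/*
	 * Only a file that is still linked will be visible after a
	 * crash, so only then must the copy reach stable storage
	 * before we proceed (the VOP_FSYNC step).
	 */
	if (st.st_nlink > 0 && fsync(fd) == -1)
		return (-1);
	return (0);
}

Note also that the hunks move VOP_UNLOCK after the write-out: the snapshot vnode must stay locked until its copied block, and any required fsync, is queued.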
diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
index 7d48fdb6e36..02c3eafaedb 100644
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
@@ -952,7 +952,7 @@ ffs_sync(mp, waitfor, cred, p)
 	struct ucred *cred;
 	struct proc *p;
 {
-	struct vnode *nvp, *vp;
+	struct vnode *nvp, *vp, *devvp;
 	struct inode *ip;
 	struct ufsmount *ump = VFSTOUFS(mp);
 	struct fs *fs;
@@ -1026,12 +1026,21 @@ loop:
 #ifdef QUOTA
 	qsync(mp);
 #endif
-	if (waitfor != MNT_LAZY) {
-		vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, p);
-		if ((error = VOP_FSYNC(ump->um_devvp, cred, waitfor, p)) != 0)
+	devvp = ump->um_devvp;
+	mtx_lock(&devvp->v_interlock);
+	if (waitfor != MNT_LAZY &&
+	    (devvp->v_numoutput > 0 || TAILQ_FIRST(&devvp->v_dirtyblkhd))) {
+		mtx_unlock(&devvp->v_interlock);
+		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
+		if ((error = VOP_FSYNC(devvp, cred, waitfor, p)) != 0)
 			allerror = error;
-		VOP_UNLOCK(ump->um_devvp, 0, p);
-	}
+		VOP_UNLOCK(devvp, 0, p);
+		if (waitfor == MNT_WAIT) {
+			mtx_lock(&mntvnode_mtx);
+			goto loop;
+		}
+	} else
+		mtx_unlock(&devvp->v_interlock);
 	/*
 	 * Write back modified superblock.
 	 */
diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c
index 7c362630d89..ad4e0199b8d 100644
--- a/sys/ufs/ufs/ufs_vnops.c
+++ b/sys/ufs/ufs/ufs_vnops.c
@@ -292,12 +292,32 @@ ufs_close(ap)
 		struct proc *a_p;
 	} */ *ap;
 {
-	register struct vnode *vp = ap->a_vp;
+	struct vnode *vp = ap->a_vp;
+	struct mount *mp;
 
 	mtx_lock(&vp->v_interlock);
-	if (vp->v_usecount > 1)
+	if (vp->v_usecount > 1) {
 		ufs_itimes(vp);
-	mtx_unlock(&vp->v_interlock);
+		mtx_unlock(&vp->v_interlock);
+	} else {
+		mtx_unlock(&vp->v_interlock);
+		/*
+		 * If we are closing the last reference to an unlinked
+		 * file, then it will be freed by the inactive routine.
+		 * Because the freeing causes the filesystem to be
+		 * modified, it must be held up during periods when the
+		 * filesystem is suspended.
+		 *
+		 * XXX - EAGAIN is returned to prevent vn_close from
+		 * repeating the vrele operation.
+		 */
+		if (vp->v_type == VREG && VTOI(vp)->i_effnlink == 0) {
+			(void) vn_start_write(vp, &mp, V_WAIT);
+			vrele(vp);
+			vn_finished_write(mp);
+			return (EAGAIN);
+		}
+	}
 	return (0);
 }
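ufs_close() above brackets the final vrele() with vn_start_write()/vn_finished_write() so that the inode freeing cannot happen while the filesystem is suspended, then returns EAGAIN so vn_close() skips its own vrele(). A compilable sketch of that suspension gate with hypothetical names; a matching suspend routine would set `suspended` and wait on the same condition variable until `writers` drains:

#include <errno.h>
#include <pthread.h>

/*
 * Hypothetical stand-ins for vn_start_write()/vn_finished_write():
 * modifying operations are counted, and a suspension blocks new
 * ones until the count drains to zero.
 */
static pthread_mutex_t gate_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t gate_cv = PTHREAD_COND_INITIALIZER;
static int suspended, writers;

static void
start_write(void)
{
	pthread_mutex_lock(&gate_mtx);
	while (suspended)		/* V_WAIT: sleep out a suspension */
		pthread_cond_wait(&gate_cv, &gate_mtx);
	writers++;
	pthread_mutex_unlock(&gate_mtx);
}

static void
finished_write(void)
{
	pthread_mutex_lock(&gate_mtx);
	if (--writers == 0)		/* wake a waiting suspender */
		pthread_cond_broadcast(&gate_cv);
	pthread_mutex_unlock(&gate_mtx);
}

/*
 * Last-close path for an unlinked file, mirroring ufs_close() above:
 * the final release frees the inode, i.e. modifies the filesystem,
 * so it is bracketed by the write gate, and EAGAIN tells the caller
 * (vn_close() in the first hunk) not to repeat the release.
 */
static int
close_last_unlinked(void (*release)(void *), void *file)
{
	start_write();
	release(file);
	finished_write();
	return (EAGAIN);
}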