From 68c49bcc40c0bf02dbedb27980910f7e3a306db1 Mon Sep 17 00:00:00 2001 From: Kirk McKusick Date: Mon, 6 Aug 2018 21:09:11 +0000 Subject: [PATCH] Put in place the framework for consolidating contiguous blocks into a smaller number of larger TRIM requests. The hope had been to have the full TRIM consolidation in place for 12.0, but the algorithms are still under development and need further testing. With this framework in place it will be possible to easily add TRIM consolidation once the optimal strategy has been found. The only functional change with this patch is the elimination of TRIM requests for blocks that are freed before they are likely to have been written. Reviewed by: kib Discussed with: Warner Losh and Chuck Silvers Sponsored by: Netflix --- sys/ufs/ffs/ffs_alloc.c | 112 +++++++++++++++++++++++++++--------- sys/ufs/ffs/ffs_balloc.c | 4 +- sys/ufs/ffs/ffs_extern.h | 19 ++++++- sys/ufs/ffs/ffs_inode.c | 47 ++++++++++++--- sys/ufs/ffs/ffs_snapshot.c | 6 +- sys/ufs/ffs/ffs_softdep.c | 113 ++++++++++++++++++++++++++++--------- sys/ufs/ffs/ffs_vfsops.c | 3 + sys/ufs/ffs/softdep.h | 1 + sys/ufs/ufs/ufsmount.h | 4 ++ 9 files changed, 242 insertions(+), 67 deletions(-) diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c index 1e1b4f1350a1..3101f618b059 100644 --- a/sys/ufs/ffs/ffs_alloc.c +++ b/sys/ufs/ffs/ffs_alloc.c @@ -110,8 +110,6 @@ static ufs2_daddr_t static void ffs_blkfree_cg(struct ufsmount *, struct fs *, struct vnode *, ufs2_daddr_t, long, ino_t, struct workhead *); -static void ffs_blkfree_trim_completed(struct buf *); -static void ffs_blkfree_trim_task(void *ctx, int pending __unused); #ifdef INVARIANTS static int ffs_checkblk(struct inode *, ufs2_daddr_t, long); #endif @@ -395,8 +393,23 @@ ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, flags, cred, bpp) if (bno > 0) { bp->b_blkno = fsbtodb(fs, bno); if (!DOINGSOFTDEP(vp)) + /* + * The usual case is that a smaller fragment that + * was just allocated has been replaced with a 
bigger + * fragment or a full-size block. If it is marked as + * B_DELWRI, the current contents have not been written + * to disk. It is possible that the block was written + * earlier, but very uncommon. If the block has never + * been written, there is no need to send a BIO_DELETE + * for it when it is freed. The gain from avoiding the + * TRIMs for the common case of unwritten blocks far + * exceeds the cost of the write amplification for the + * uncommon case of failing to send a TRIM for a block + * that had been written. + */ ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize, - ip->i_number, vp->v_type, NULL); + ip->i_number, vp->v_type, NULL, + (bp->b_flags & B_DELWRI) != 0 ? NOTRIM : SINGLETON); delta = btodb(nsize - osize); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); if (flags & IO_EXT) @@ -521,7 +534,7 @@ ffs_reallocblks_ufs1(ap) struct fs *fs; struct inode *ip; struct vnode *vp; - struct buf *sbp, *ebp; + struct buf *sbp, *ebp, *bp; ufs1_daddr_t *bap, *sbap, *ebap; struct cluster_save *buflist; struct ufsmount *ump; @@ -730,14 +743,29 @@ ffs_reallocblks_ufs1(ap) printf("\n\tnew:"); #endif for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { + bp = buflist->bs_children[i]; if (!DOINGSOFTDEP(vp)) + /* + * The usual case is that a set of N-contiguous blocks + * that was just allocated has been replaced with a + * set of N+1-contiguous blocks. If they are marked as + * B_DELWRI, the current contents have not been written + * to disk. It is possible that the blocks were written + * earlier, but very uncommon. If the blocks have never + * been written, there is no need to send a BIO_DELETE + * for them when they are freed. The gain from avoiding + * the TRIMs for the common case of unwritten blocks + * far exceeds the cost of the write amplification for + * the uncommon case of failing to send a TRIM for the + * blocks that had been written. 
+ */ ffs_blkfree(ump, fs, ump->um_devvp, - dbtofsb(fs, buflist->bs_children[i]->b_blkno), - fs->fs_bsize, ip->i_number, vp->v_type, NULL); - buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); + dbtofsb(fs, bp->b_blkno), + fs->fs_bsize, ip->i_number, vp->v_type, NULL, + (bp->b_flags & B_DELWRI) != 0 ? NOTRIM : SINGLETON); + bp->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS - if (!ffs_checkblk(ip, - dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) + if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 3"); #endif #ifdef DEBUG @@ -771,7 +799,7 @@ ffs_reallocblks_ufs2(ap) struct fs *fs; struct inode *ip; struct vnode *vp; - struct buf *sbp, *ebp; + struct buf *sbp, *ebp, *bp; ufs2_daddr_t *bap, *sbap, *ebap; struct cluster_save *buflist; struct ufsmount *ump; @@ -978,14 +1006,29 @@ ffs_reallocblks_ufs2(ap) printf("\n\tnew:"); #endif for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { + bp = buflist->bs_children[i]; if (!DOINGSOFTDEP(vp)) + /* + * The usual case is that a set of N-contiguous blocks + * that was just allocated has been replaced with a + * set of N+1-contiguous blocks. If they are marked as + * B_DELWRI, the current contents have not been written + * to disk. It is possible that the blocks were written + * earlier, but very uncommon. If the blocks have never + * been written, there is no need to send a BIO_DELETE + * for them when they are freed. The gain from avoiding + * the TRIMs for the common case of unwritten blocks + * far exceeds the cost of the write amplification for + * the uncommon case of failing to send a TRIM for the + * blocks that had been written. + */ ffs_blkfree(ump, fs, ump->um_devvp, - dbtofsb(fs, buflist->bs_children[i]->b_blkno), - fs->fs_bsize, ip->i_number, vp->v_type, NULL); - buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); + dbtofsb(fs, bp->b_blkno), + fs->fs_bsize, ip->i_number, vp->v_type, NULL, + (bp->b_flags & B_DELWRI) != 0 ? 
NOTRIM : SINGLETON); + bp->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS - if (!ffs_checkblk(ip, - dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) + if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 3"); #endif #ifdef DEBUG @@ -1823,8 +1866,7 @@ ffs_alloccgblk(ip, bp, bpref, size) /* XXX Fixme. */ UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) - softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, - size, 0); + softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, size, 0); UFS_LOCK(ump); return (blkno); } @@ -2254,6 +2296,17 @@ ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd) bdwrite(bp); } +/* + * Structures and routines associated with trim management. + */ +MALLOC_DEFINE(M_TRIM, "ufs_trim", "UFS trim structures"); + +#define TRIMLIST_HASH(ump, inum) \ + (&(ump)->um_trimhash[(inum) & (ump)->um_trimlisthashsize]) + +static void ffs_blkfree_trim_completed(struct buf *); +static void ffs_blkfree_trim_task(void *ctx, int pending __unused); + struct ffs_blkfree_trim_params { struct task task; struct ufsmount *ump; @@ -2277,7 +2330,7 @@ ffs_blkfree_trim_task(ctx, pending) tp->inum, tp->pdephd); vn_finished_secondary_write(UFSTOVFS(tp->ump)); atomic_add_int(&tp->ump->um_trim_inflight, -1); - free(tp, M_TEMP); + free(tp, M_TRIM); } static void @@ -2287,13 +2340,13 @@ ffs_blkfree_trim_completed(bp) struct ffs_blkfree_trim_params *tp; tp = bp->b_fsprivate1; - free(bp, M_TEMP); + free(bp, M_TRIM); TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp); taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task); } void -ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd) +ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd, trimtype) struct ufsmount *ump; struct fs *fs; struct vnode *devvp; @@ -2302,6 +2355,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd) ino_t inum; enum vtype vtype; struct workhead *dephd; + int trimtype; { struct mount *mp; struct buf *bp; @@ -2319,10 +2373,11 @@ 
ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd) return; } /* - * Nothing to delay if TRIM is disabled, or the operation is - * performed on the snapshot. + * Nothing to delay if TRIM is not required for this block or TRIM + * is disabled or the operation is performed on a snapshot. */ - if (((ump->um_flags) & UM_CANDELETE) == 0 || devvp->v_type == VREG) { + if (trimtype == NOTRIM || ((ump->um_flags & UM_CANDELETE) == 0) || + devvp->v_type == VREG) { ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd); return; } @@ -2334,7 +2389,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd) * and write some new data into it. */ atomic_add_int(&ump->um_trim_inflight, 1); - tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TEMP, M_WAITOK); + tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TRIM, M_WAITOK); tp->ump = ump; tp->devvp = devvp; tp->bno = bno; @@ -2347,7 +2402,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd) } else tp->pdephd = NULL; - bp = malloc(sizeof(*bp), M_TEMP, M_WAITOK | M_ZERO); + bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO); bp->b_iocmd = BIO_DELETE; bp->b_iooffset = dbtob(fsbtodb(fs, bno)); bp->b_iodone = ffs_blkfree_trim_completed; @@ -2824,7 +2879,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) long blkcnt, blksize; struct file *fp, *vfp; cap_rights_t rights; - int filetype, error; + int filetype, trimtype, error; static struct fileops *origops, bufferedops; if (req->newlen > sizeof cmd) @@ -2956,14 +3011,17 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) blkno = cmd.value; blkcnt = cmd.size; blksize = fs->fs_frag - (blkno % fs->fs_frag); + trimtype = (blksize < blkcnt) ? STARTFREE : SINGLETON; while (blkcnt > 0) { if (blksize > blkcnt) blksize = blkcnt; ffs_blkfree(ump, fs, ump->um_devvp, blkno, - blksize * fs->fs_fsize, UFS_ROOTINO, VDIR, NULL); + blksize * fs->fs_fsize, UFS_ROOTINO, + VDIR, NULL, trimtype); blkno += blksize; blkcnt -= blksize; blksize = fs->fs_frag; + trimtype = (blksize < blkcnt) ? 
CONTINUEFREE : ENDFREE; } break; diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c index 6143b4fca8c0..552c295753d7 100644 --- a/sys/ufs/ffs/ffs_balloc.c +++ b/sys/ufs/ffs/ffs_balloc.c @@ -553,7 +553,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, lbns_remfree++; #endif ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize, - ip->i_number, vp->v_type, NULL); + ip->i_number, vp->v_type, NULL, SINGLETON); } return (error); } @@ -1147,7 +1147,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, lbns_remfree++; #endif ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize, - ip->i_number, vp->v_type, NULL); + ip->i_number, vp->v_type, NULL, SINGLETON); } return (error); } diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h index 2df48ec91de9..e0a1a13c274f 100644 --- a/sys/ufs/ffs/ffs_extern.h +++ b/sys/ufs/ffs/ffs_extern.h @@ -63,7 +63,7 @@ int ffs_balloc_ufs2(struct vnode *a_vp, off_t a_startoffset, int a_size, struct ucred *a_cred, int a_flags, struct buf **a_bpp); int ffs_blkatoff(struct vnode *, off_t, char **, struct buf **); void ffs_blkfree(struct ufsmount *, struct fs *, struct vnode *, - ufs2_daddr_t, long, ino_t, enum vtype, struct workhead *); + ufs2_daddr_t, long, ino_t, enum vtype, struct workhead *, int); ufs2_daddr_t ffs_blkpref_ufs1(struct inode *, ufs_lbn_t, int, ufs1_daddr_t *); ufs2_daddr_t ffs_blkpref_ufs2(struct inode *, ufs_lbn_t, int, ufs2_daddr_t *); int ffs_checkfreefile(struct fs *, struct vnode *, ino_t); @@ -111,11 +111,28 @@ vfs_vget_t ffs_vget; int ffs_vgetf(struct mount *, ino_t, int, struct vnode **, int); void process_deferred_inactive(struct mount *mp); +/* + * Flags to ffs_vgetf + */ #define FFSV_FORCEINSMQ 0x0001 +/* + * Flags to ffs_reload + */ #define FFSR_FORCE 0x0001 #define FFSR_UNSUSPEND 0x0002 +/* + * Trim type to ffs_blkfree - used to help with BIO_DELETE (trim) requests + */ +#define NOTRIM 1 /* never written, so don't call trim for it */ +#define SINGLETON 2 
/* only block being freed, so trim it now */ +#define STARTFREE 3 /* beginning to free for this inum */ +#define CONTINUEFREE 4 /* additional block free for this inum */ +#define ENDFREE 5 /* last block to free for this inum */ + +#define MAXTRIMIO 1024 /* maximum expected outstanding trim requests */ + extern struct vop_vector ffs_vnodeops1; extern struct vop_vector ffs_fifoops1; extern struct vop_vector ffs_vnodeops2; diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c index 6a26ef97189a..3cf58558c185 100644 --- a/sys/ufs/ffs/ffs_inode.c +++ b/sys/ufs/ffs/ffs_inode.c @@ -195,7 +195,7 @@ ffs_truncate(vp, length, flags, cred) struct ufsmount *ump; int softdeptrunc, journaltrunc; int needextclean, extblocks; - int offset, size, level, nblocks; + int trimtype, firstfree, offset, size, level, nblocks; int i, error, allerror, indiroff, waitforupdate; off_t osize; @@ -275,7 +275,7 @@ ffs_truncate(vp, length, flags, cred) continue; ffs_blkfree(ump, fs, ITODEVVP(ip), oldblks[i], sblksize(fs, osize, i), ip->i_number, - vp->v_type, NULL); + vp->v_type, NULL, SINGLETON); } } } @@ -523,7 +523,7 @@ ffs_truncate(vp, length, flags, cred) DIP_SET(ip, i_ib[level], 0); ffs_blkfree(ump, fs, ump->um_devvp, bn, fs->fs_bsize, ip->i_number, - vp->v_type, NULL); + vp->v_type, NULL, SINGLETON); blocksreleased += nblocks; } } @@ -534,6 +534,7 @@ ffs_truncate(vp, length, flags, cred) /* * All whole direct blocks or frags. 
*/ + firstfree = 1; for (i = UFS_NDADDR - 1; i > lastblock; i--) { long bsize; @@ -542,8 +543,23 @@ ffs_truncate(vp, length, flags, cred) continue; DIP_SET(ip, i_db[i], 0); bsize = blksize(fs, ip, i); + if (firstfree) { + if (i - 1 == lastblock || DIP(ip, i_db[i - 1]) == 0) { + trimtype = SINGLETON; + } else { + trimtype = STARTFREE; + firstfree = 0; + } + } else { + if (i - 1 == lastblock || DIP(ip, i_db[i - 1]) == 0) { + trimtype = ENDFREE; + firstfree = 1; + } else { + trimtype = CONTINUEFREE; + } + } ffs_blkfree(ump, fs, ump->um_devvp, bn, bsize, ip->i_number, - vp->v_type, NULL); + vp->v_type, NULL, trimtype); blocksreleased += btodb(bsize); } if (lastblock < 0) @@ -575,7 +591,8 @@ ffs_truncate(vp, length, flags, cred) */ bn += numfrags(fs, newspace); ffs_blkfree(ump, fs, ump->um_devvp, bn, - oldspace - newspace, ip->i_number, vp->v_type, NULL); + oldspace - newspace, ip->i_number, vp->v_type, + NULL, SINGLETON); blocksreleased += btodb(oldspace - newspace); } } @@ -636,7 +653,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp) struct fs *fs; struct vnode *vp; caddr_t copy = NULL; - int i, nblocks, error = 0, allerror = 0; + int i, trimtype, nblocks, firstfree, error = 0, allerror = 0; ufs2_daddr_t nb, nlbn, last; ufs2_daddr_t blkcount, factor, blocksreleased = 0; ufs1_daddr_t *bap1 = NULL; @@ -719,6 +736,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp) /* * Recursively free totally unused blocks. 
*/ + firstfree = 1; for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last; i--, nlbn += factor) { nb = BAP(ip, i); @@ -730,8 +748,23 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp) allerror = error; blocksreleased += blkcount; } + if (firstfree) { + if (i - 1 == last || BAP(ip, i - 1) == 0) { + trimtype = SINGLETON; + } else { + trimtype = STARTFREE; + firstfree = 0; + } + } else { + if (i - 1 == last || BAP(ip, i - 1) == 0) { + trimtype = ENDFREE; + firstfree = 1; + } else { + trimtype = CONTINUEFREE; + } + } ffs_blkfree(ITOUMP(ip), fs, ITODEVVP(ip), nb, fs->fs_bsize, - ip->i_number, vp->v_type, NULL); + ip->i_number, vp->v_type, NULL, trimtype); blocksreleased += nblocks; } diff --git a/sys/ufs/ffs/ffs_snapshot.c b/sys/ufs/ffs/ffs_snapshot.c index fed0456b13cb..e87e097177aa 100644 --- a/sys/ufs/ffs/ffs_snapshot.c +++ b/sys/ufs/ffs/ffs_snapshot.c @@ -583,7 +583,7 @@ ffs_snapshot(mp, snapfile) if (len != 0 && len < fs->fs_bsize) { ffs_blkfree(ump, copy_fs, vp, DIP(xp, i_db[loc]), len, xp->i_number, - xvp->v_type, NULL); + xvp->v_type, NULL, SINGLETON); blkno = DIP(xp, i_db[loc]); DIP_SET(xp, i_db[loc], 0); } @@ -1265,7 +1265,7 @@ mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum, - vp->v_type, NULL); + vp->v_type, NULL, SINGLETON); } return (0); } @@ -1549,7 +1549,7 @@ mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum, - vp->v_type, NULL); + vp->v_type, NULL, SINGLETON); } return (0); } diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c index 4943555198db..6ddb8db12451 100644 --- a/sys/ufs/ffs/ffs_softdep.c +++ b/sys/ufs/ffs/ffs_softdep.c @@ -869,7 +869,7 @@ static void cancel_allocdirect(struct allocdirectlst *, struct allocdirect *, struct freeblks *); static int 
check_inode_unwritten(struct inodedep *); static int free_inodedep(struct inodedep *); -static void freework_freeblock(struct freework *); +static void freework_freeblock(struct freework *, int); static void freework_enqueue(struct freework *); static int handle_workitem_freeblocks(struct freeblks *, int); static int handle_complete_freeblocks(struct freeblks *, int); @@ -884,7 +884,7 @@ static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, ufs2_daddr_t, ufs_lbn_t); static void handle_workitem_freefrag(struct freefrag *); static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long, - ufs_lbn_t); + ufs_lbn_t, int); static void allocdirect_merge(struct allocdirectlst *, struct allocdirect *, struct allocdirect *); static struct freefrag *allocindir_merge(struct allocindir *, @@ -5289,7 +5289,22 @@ softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp) KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_allocdirect called on non-softdep filesystem")); if (oldblkno && oldblkno != newblkno) - freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); + /* + * The usual case is that a smaller fragment that + * was just allocated has been replaced with a bigger + * fragment or a full-size block. If it is marked as + * B_DELWRI, the current contents have not been written + * to disk. It is possible that the block was written + * earlier, but very uncommon. If the block has never + * been written, there is no need to send a BIO_DELETE + * for it when it is freed. The gain from avoiding the + * TRIMs for the common case of unwritten blocks far + * exceeds the cost of the write amplification for the + * uncommon case of failing to send a TRIM for a block + * that had been written. + */ + freefrag = newfreefrag(ip, oldblkno, oldsize, lbn, + (bp->b_flags & B_DELWRI) != 0 ? NOTRIM : SINGLETON); else freefrag = NULL; @@ -5566,11 +5581,12 @@ newjfreefrag(freefrag, ip, blkno, size, lbn) * Allocate a new freefrag structure. 
*/ static struct freefrag * -newfreefrag(ip, blkno, size, lbn) +newfreefrag(ip, blkno, size, lbn, trimtype) struct inode *ip; ufs2_daddr_t blkno; long size; ufs_lbn_t lbn; + int trimtype; { struct freefrag *freefrag; struct ufsmount *ump; @@ -5591,6 +5607,7 @@ newfreefrag(ip, blkno, size, lbn) freefrag->ff_vtype = ITOV(ip)->v_type; freefrag->ff_blkno = blkno; freefrag->ff_fragsize = size; + freefrag->ff_trimtype = trimtype; if (MOUNTEDSUJ(UFSTOVFS(ump))) { freefrag->ff_jdep = (struct worklist *) @@ -5636,7 +5653,8 @@ handle_workitem_freefrag(freefrag) } FREE_LOCK(ump); ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno, - freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd); + freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd, + freefrag->ff_trimtype); ACQUIRE_LOCK(ump); WORKITEM_FREE(freefrag, D_FREEFRAG); FREE_LOCK(ump); @@ -5676,7 +5694,22 @@ softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp) lbn = bp->b_lblkno; if (oldblkno && oldblkno != newblkno) - freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); + /* + * The usual case is that a smaller fragment that + * was just allocated has been replaced with a bigger + * fragment or a full-size block. If it is marked as + * B_DELWRI, the current contents have not been written + * to disk. It is possible that the block was written + * earlier, but very uncommon. If the block has never + * been written, there is no need to send a BIO_DELETE + * for it when it is freed. The gain from avoiding the + * TRIMs for the common case of unwritten blocks far + * exceeds the cost of the write amplification for the + * uncommon case of failing to send a TRIM for a block + * that had been written. + */ + freefrag = newfreefrag(ip, oldblkno, oldsize, lbn, + (bp->b_flags & B_DELWRI) != 0 ? 
NOTRIM : SINGLETON); else freefrag = NULL; @@ -5789,7 +5822,8 @@ newallocindir(ip, ptrno, newblkno, oldblkno, lbn) struct jnewblk *jnewblk; if (oldblkno) - freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn); + freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn, + SINGLETON); else freefrag = NULL; ACQUIRE_LOCK(ITOUMP(ip)); @@ -7724,8 +7758,9 @@ free_inodedep(inodedep) * in memory immediately. */ static void -freework_freeblock(freework) +freework_freeblock(freework, trimtype) struct freework *freework; + int trimtype; { struct freeblks *freeblks; struct jnewblk *jnewblk; @@ -7779,10 +7814,10 @@ freework_freeblock(freework) FREE_LOCK(ump); freeblks_free(ump, freeblks, btodb(bsize)); CTR4(KTR_SUJ, - "freework_freeblock: ino %d blkno %jd lbn %jd size %ld", + "freework_freeblock: ino %jd blkno %jd lbn %jd size %d", freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize); ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize, - freeblks->fb_inum, freeblks->fb_vtype, &wkhd); + freeblks->fb_inum, freeblks->fb_vtype, &wkhd, trimtype); ACQUIRE_LOCK(ump); /* * The jnewblk will be discarded and the bits in the map never @@ -7835,7 +7870,7 @@ handle_workitem_indirblk(freework) return; } if (freework->fw_off == NINDIR(fs)) { - freework_freeblock(freework); + freework_freeblock(freework, SINGLETON); return; } freework->fw_state |= INPROGRESS; @@ -7889,16 +7924,19 @@ handle_workitem_freeblocks(freeblks, flags) struct freeblks *freeblks; int flags; { - struct freework *freework; + struct freework *freework, *prevfreework; struct newblk *newblk; struct allocindir *aip; struct ufsmount *ump; struct worklist *wk; + int trimtype; KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd), ("handle_workitem_freeblocks: Journal entries not written.")); ump = VFSTOUFS(freeblks->fb_list.wk_mp); ACQUIRE_LOCK(ump); + prevfreework = NULL; + trimtype = 0; while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) { WORKLIST_REMOVE(wk); switch (wk->wk_type) { @@ 
-7932,16 +7970,26 @@ handle_workitem_freeblocks(freeblks, flags) case D_FREEWORK: freework = WK_FREEWORK(wk); - if (freework->fw_lbn <= -UFS_NDADDR) + if (freework->fw_lbn <= -UFS_NDADDR) { handle_workitem_indirblk(freework); - else - freework_freeblock(freework); + continue; + } else if (prevfreework == NULL) { + trimtype = SINGLETON; + } else if (trimtype == SINGLETON) { + freework_freeblock(prevfreework, STARTFREE); + trimtype = ENDFREE; + } else { + freework_freeblock(prevfreework, CONTINUEFREE); + } + prevfreework = freework; continue; default: panic("handle_workitem_freeblocks: Unknown type %s", TYPENAME(wk->wk_type)); } } + if (prevfreework != NULL) + freework_freeblock(prevfreework, trimtype); if (freeblks->fb_ref != 0) { freeblks->fb_state &= ~INPROGRESS; wake_worklist(&freeblks->fb_list); @@ -8080,13 +8128,8 @@ indir_trunc(freework, dbn, lbn) ufs1_daddr_t *bap1; ufs2_daddr_t nb, nnb, *bap2; ufs_lbn_t lbnadd, nlbn; - int i, nblocks, ufs1fmt; - int freedblocks; - int goingaway; - int freedeps; - int needj; - int level; - int cnt; + int nblocks, ufs1fmt, firstfree, trimtype, freedblocks; + int goingaway, freedeps, needj, level, cnt, i; freeblks = freework->fw_freeblks; ump = VFSTOUFS(freeblks->fb_list.wk_mp); @@ -8180,6 +8223,7 @@ indir_trunc(freework, dbn, lbn) * arranges for the current level to be freed when subordinates * are free when journaling. 
*/ + firstfree = 1; for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) { if (i != NINDIR(fs) - 1) { if (ufs1fmt) @@ -8215,11 +8259,26 @@ indir_trunc(freework, dbn, lbn) freedeps++; } CTR3(KTR_SUJ, - "indir_trunc: ino %d blkno %jd size %ld", + "indir_trunc: ino %jd blkno %jd size %d", freeblks->fb_inum, nb, fs->fs_bsize); + if (firstfree) { + if (i == NINDIR(fs) - 1 || nnb == 0) { + trimtype = SINGLETON; + } else { + trimtype = STARTFREE; + firstfree = 0; + } + } else { + if (i == NINDIR(fs) - 1 || nnb == 0) { + trimtype = ENDFREE; + firstfree = 1; + } else { + trimtype = CONTINUEFREE; + } + } ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize, freeblks->fb_inum, - freeblks->fb_vtype, &wkhd); + freeblks->fb_vtype, &wkhd, trimtype); } } if (goingaway) { @@ -8244,7 +8303,7 @@ indir_trunc(freework, dbn, lbn) if (level == 0) freeblks->fb_cgwait += freedeps; if (freework->fw_ref == 0) - freework_freeblock(freework); + freework_freeblock(freework, SINGLETON); FREE_LOCK(ump); return; } @@ -8253,10 +8312,10 @@ indir_trunc(freework, dbn, lbn) */ dbn = dbtofsb(fs, dbn); CTR3(KTR_SUJ, - "indir_trunc 2: ino %d blkno %jd size %ld", + "indir_trunc 2: ino %jd blkno %jd size %d", freeblks->fb_inum, dbn, fs->fs_bsize); ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize, - freeblks->fb_inum, freeblks->fb_vtype, NULL); + freeblks->fb_inum, freeblks->fb_vtype, NULL, SINGLETON); /* Non SUJ softdep does single-threaded truncations. 
*/ if (freework->fw_blkno == dbn) { freework->fw_state |= ALLCOMPLETE; diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c index 9ed5c58f7b0e..e3927327c79c 100644 --- a/sys/ufs/ffs/ffs_vfsops.c +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -978,6 +978,8 @@ ffs_mountfs(devvp, mp, td) taskqueue_thread_enqueue, &ump->um_trim_tq); taskqueue_start_threads(&ump->um_trim_tq, 1, PVFS, "%s trim", mp->mnt_stat.f_mntonname); + ump->um_trimhash = hashinit(MAXTRIMIO, M_TRIM, + &ump->um_trimlisthashsize); } } @@ -1256,6 +1258,7 @@ ffs_unmount(mp, mntflags) pause("ufsutr", hz); taskqueue_drain_all(ump->um_trim_tq); taskqueue_free(ump->um_trim_tq); + free (ump->um_trimhash, M_TRIM); } g_topology_lock(); if (ump->um_fsckpid > 0) { diff --git a/sys/ufs/ffs/softdep.h b/sys/ufs/ffs/softdep.h index 707429fe68c6..5e1fa4ff3680 100644 --- a/sys/ufs/ffs/softdep.h +++ b/sys/ufs/ffs/softdep.h @@ -557,6 +557,7 @@ struct freefrag { long ff_fragsize; /* size of fragment being deleted */ ino_t ff_inum; /* owning inode number */ enum vtype ff_vtype; /* owning inode's file type */ + int ff_trimtype; /* trim status when deleted */ }; /* diff --git a/sys/ufs/ufs/ufsmount.h b/sys/ufs/ufs/ufsmount.h index 1958b02f6abb..2b6896013bd2 100644 --- a/sys/ufs/ufs/ufsmount.h +++ b/sys/ufs/ufs/ufsmount.h @@ -47,6 +47,7 @@ struct ufs_args { #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_UFSMNT); +MALLOC_DECLARE(M_TRIM); #endif struct buf; @@ -63,6 +64,7 @@ struct inodedep; TAILQ_HEAD(inodedeplst, inodedep); LIST_HEAD(bmsafemaphd, bmsafemap); +LIST_HEAD(trimlist_hashhead, ffs_blkfree_trim_params); /* * This structure describes the UFS specific mount structure data. 
@@ -101,6 +103,8 @@ struct ufsmount { u_int um_flags; /* (i) filesystem flags */ u_int um_trim_inflight; /* (a) outstanding trim count */ struct taskqueue *um_trim_tq; /* (c) trim request queue */ + struct trimlist_hashhead *um_trimhash; /* (i) trimlist hash table */ + u_long um_trimlisthashsize; /* (i) trim hash table size-1 */ /* (c) - below function ptrs */ int (*um_balloc)(struct vnode *, off_t, int, struct ucred *, int, struct buf **);