From 0012f373e43db2341c20329163ed2d5ad3b0f341 Mon Sep 17 00:00:00 2001 From: Jeff Roberson Date: Tue, 15 Oct 2019 03:45:41 +0000 Subject: [PATCH] (4/6) Protect page valid with the busy lock. Atomics are used for page busy and valid state when the shared busy is held. The details of the locking protocol and valid and dirty synchronization are in the updated vm_page.h comments. Reviewed by: kib, markj Tested by: pho Sponsored by: Netflix, Intel Differential Revision: https://reviews.freebsd.org/D21594 --- sys/amd64/sgx/sgx.c | 4 +- .../opensolaris/uts/common/fs/zfs/dmu.c | 24 +- .../opensolaris/uts/common/fs/zfs/zfs_vnops.c | 15 +- sys/compat/linuxkpi/common/src/linux_compat.c | 2 +- sys/dev/drm2/ttm/ttm_bo_vm.c | 2 +- sys/dev/drm2/ttm/ttm_tt.c | 2 +- sys/dev/md/md.c | 10 +- sys/dev/netmap/netmap_freebsd.c | 2 +- sys/dev/xen/gntdev/gntdev.c | 2 +- sys/dev/xen/privcmd/privcmd.c | 2 +- sys/fs/nfsclient/nfs_clbio.c | 6 +- sys/fs/smbfs/smbfs_io.c | 6 +- sys/fs/tmpfs/tmpfs_subr.c | 2 +- sys/kern/kern_exec.c | 8 +- sys/kern/uipc_shm.c | 4 +- sys/kern/vfs_bio.c | 31 ++- sys/kern/vfs_cluster.c | 12 +- sys/vm/device_pager.c | 2 +- sys/vm/phys_pager.c | 10 +- sys/vm/sg_pager.c | 2 +- sys/vm/swap_pager.c | 4 +- sys/vm/vm_fault.c | 19 +- sys/vm/vm_map.c | 2 +- sys/vm/vm_mmap.c | 4 +- sys/vm/vm_object.c | 21 +- sys/vm/vm_page.c | 219 ++++++++++++------ sys/vm/vm_page.h | 121 ++++++---- sys/vm/vm_pageout.c | 6 +- sys/vm/vm_swapout.c | 4 +- sys/vm/vnode_pager.c | 19 +- 30 files changed, 369 insertions(+), 198 deletions(-) diff --git a/sys/amd64/sgx/sgx.c b/sys/amd64/sgx/sgx.c index ea18c9674234..8d100386e383 100644 --- a/sys/amd64/sgx/sgx.c +++ b/sys/amd64/sgx/sgx.c @@ -220,8 +220,8 @@ sgx_va_slot_init_by_index(struct sgx_softc *sc, vm_object_t object, page = PHYS_TO_VM_PAGE(epc->phys); - vm_page_insert(page, object, idx); page->valid = VM_PAGE_BITS_ALL; + vm_page_insert(page, object, idx); } return (0); @@ -610,8 +610,8 @@ sgx_insert_epc_page_by_index(vm_page_t page, vm_object_t 
object, VM_OBJECT_ASSERT_WLOCKED(object); - vm_page_insert(page, object, pidx); page->valid = VM_PAGE_BITS_ALL; + vm_page_insert(page, object, pidx); } static void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c index 80eca9a76790..aaaf8394f0d1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -1731,11 +1731,13 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, db = dbp[0]; for (i = 0; i < *rbehind; i++) { m = vm_page_grab(vmobj, ma[0]->pindex - 1 - i, - VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY); + VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | + VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY); if (m == NULL) break; - if (m->valid != 0) { + if (!vm_page_none_valid(m)) { ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL); + vm_page_sunbusy(m); break; } ASSERT(m->dirty == 0); @@ -1746,13 +1748,14 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, va = zfs_map_page(m, &sf); bcopy((char *)db->db_data + bufoff, va, PAGESIZE); zfs_unmap_page(sf); - m->valid = VM_PAGE_BITS_ALL; + vm_page_valid(m); vm_page_lock(m); if ((m->busy_lock & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); vm_page_unlock(m); + vm_page_sunbusy(m); } *rbehind = i; @@ -1763,7 +1766,7 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, m = ma[mi]; if (m != bogus_page) { vm_page_assert_xbusied(m); - ASSERT(m->valid == 0); + ASSERT(vm_page_none_valid(m)); ASSERT(m->dirty == 0); ASSERT(!pmap_page_is_mapped(m)); va = zfs_map_page(m, &sf); @@ -1791,7 +1794,7 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, if (pgoff == PAGESIZE) { if (m != bogus_page) { zfs_unmap_page(sf); - m->valid = VM_PAGE_BITS_ALL; + vm_page_valid(m); } ASSERT(mi < count); mi++; @@ -1840,16 +1843,18 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, ASSERT(m != 
bogus_page); bzero(va + pgoff, PAGESIZE - pgoff); zfs_unmap_page(sf); - m->valid = VM_PAGE_BITS_ALL; + vm_page_valid(m); } for (i = 0; i < *rahead; i++) { m = vm_page_grab(vmobj, ma[count - 1]->pindex + 1 + i, - VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY); + VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | + VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY); if (m == NULL) break; - if (m->valid != 0) { + if (!vm_page_none_valid(m)) { ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL); + vm_page_sunbusy(m); break; } ASSERT(m->dirty == 0); @@ -1866,13 +1871,14 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, bzero(va + tocpy, PAGESIZE - tocpy); } zfs_unmap_page(sf); - m->valid = VM_PAGE_BITS_ALL; + vm_page_valid(m); vm_page_lock(m); if ((m->busy_lock & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); vm_page_unlock(m); + vm_page_sunbusy(m); } *rahead = i; zfs_vmobject_wunlock(vmobj); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c index 54f5482aa773..7d50de506169 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -534,7 +534,7 @@ mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); - if (pp->valid == 0) { + if (vm_page_none_valid(pp)) { zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); error = dmu_read(os, zp->z_id, start, bytes, va, @@ -543,17 +543,16 @@ mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) bzero(va + bytes, PAGESIZE - bytes); zfs_unmap_page(sf); zfs_vmobject_wlock(obj); - vm_page_sunbusy(pp); - if (error) { - if (!vm_page_busied(pp) && !vm_page_wired(pp) && - pp->valid == 0) - vm_page_free(pp); - } else { - pp->valid = VM_PAGE_BITS_ALL; + if (error == 0) { + vm_page_valid(pp); vm_page_lock(pp); vm_page_activate(pp); vm_page_unlock(pp); } + vm_page_sunbusy(pp); 
+ if (error != 0 && !vm_page_wired(pp) == 0 && + pp->valid == 0 && vm_page_tryxbusy(pp)) + vm_page_free(pp); } else { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_page_sunbusy(pp); diff --git a/sys/compat/linuxkpi/common/src/linux_compat.c b/sys/compat/linuxkpi/common/src/linux_compat.c index 3ccdf87c3772..10d0866c2545 100644 --- a/sys/compat/linuxkpi/common/src/linux_compat.c +++ b/sys/compat/linuxkpi/common/src/linux_compat.c @@ -514,7 +514,7 @@ linux_cdev_pager_fault(vm_object_t vm_obj, vm_ooffset_t offset, int prot, vm_page_free(*mres); *mres = page; } - page->valid = VM_PAGE_BITS_ALL; + vm_page_valid(page); return (VM_PAGER_OK); } return (VM_PAGER_FAIL); diff --git a/sys/dev/drm2/ttm/ttm_bo_vm.c b/sys/dev/drm2/ttm/ttm_bo_vm.c index 3538831bbdfd..0f35dac3a1d9 100644 --- a/sys/dev/drm2/ttm/ttm_bo_vm.c +++ b/sys/dev/drm2/ttm/ttm_bo_vm.c @@ -252,7 +252,7 @@ ttm_bo_vm_fault(vm_object_t vm_obj, vm_ooffset_t offset, ("inconsistent insert bo %p m %p m1 %p offset %jx", bo, m, m1, (uintmax_t)offset)); } - m->valid = VM_PAGE_BITS_ALL; + vm_page_valid(m); if (*mres != NULL) { KASSERT(*mres != m, ("losing %p %p", *mres, m)); vm_page_free(*mres); diff --git a/sys/dev/drm2/ttm/ttm_tt.c b/sys/dev/drm2/ttm/ttm_tt.c index ec3aed665e3b..7b2a2d03e678 100644 --- a/sys/dev/drm2/ttm/ttm_tt.c +++ b/sys/dev/drm2/ttm/ttm_tt.c @@ -344,7 +344,7 @@ int ttm_tt_swapout(struct ttm_tt *ttm, vm_object_t persistent_swap_storage) continue; to_page = vm_page_grab(obj, i, VM_ALLOC_NORMAL); pmap_copy_page(from_page, to_page); - to_page->valid = VM_PAGE_BITS_ALL; + vm_page_valid(to_page); vm_page_dirty(to_page); vm_page_xunbusy(to_page); } diff --git a/sys/dev/md/md.c b/sys/dev/md/md.c index 110cbfdecc87..cf9771a833f6 100644 --- a/sys/dev/md/md.c +++ b/sys/dev/md/md.c @@ -1074,7 +1074,7 @@ mdstart_swap(struct md_s *sc, struct bio *bp) len = ((i == lastp) ? 
lastend : PAGE_SIZE) - offs; m = vm_page_grab(sc->object, i, VM_ALLOC_SYSTEM); if (bp->bio_cmd == BIO_READ) { - if (m->valid == VM_PAGE_BITS_ALL) + if (vm_page_all_valid(m)) rv = VM_PAGER_OK; else rv = vm_pager_get_pages(sc->object, &m, 1, @@ -1090,7 +1090,7 @@ mdstart_swap(struct md_s *sc, struct bio *bp) * can be recreated if thrown out. */ pmap_zero_page(m); - m->valid = VM_PAGE_BITS_ALL; + vm_page_valid(m); } if ((bp->bio_flags & BIO_UNMAPPED) != 0) { pmap_copy_pages(&m, offs, bp->bio_ma, @@ -1104,7 +1104,7 @@ mdstart_swap(struct md_s *sc, struct bio *bp) cpu_flush_dcache(p, len); } } else if (bp->bio_cmd == BIO_WRITE) { - if (len == PAGE_SIZE || m->valid == VM_PAGE_BITS_ALL) + if (len == PAGE_SIZE || vm_page_all_valid(m)) rv = VM_PAGER_OK; else rv = vm_pager_get_pages(sc->object, &m, 1, @@ -1125,13 +1125,13 @@ mdstart_swap(struct md_s *sc, struct bio *bp) physcopyin(p, VM_PAGE_TO_PHYS(m) + offs, len); } - m->valid = VM_PAGE_BITS_ALL; + vm_page_valid(m); if (m->dirty != VM_PAGE_BITS_ALL) { vm_page_dirty(m); vm_pager_page_unswapped(m); } } else if (bp->bio_cmd == BIO_DELETE) { - if (len == PAGE_SIZE || m->valid == VM_PAGE_BITS_ALL) + if (len == PAGE_SIZE || vm_page_all_valid(m)) rv = VM_PAGER_OK; else rv = vm_pager_get_pages(sc->object, &m, 1, diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c index 42551df09c2a..2580144ab7fe 100644 --- a/sys/dev/netmap/netmap_freebsd.c +++ b/sys/dev/netmap/netmap_freebsd.c @@ -1056,7 +1056,7 @@ netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, *mres = page; vm_page_insert(page, object, pidx); } - page->valid = VM_PAGE_BITS_ALL; + vm_page_valid(page); return (VM_PAGER_OK); } diff --git a/sys/dev/xen/gntdev/gntdev.c b/sys/dev/xen/gntdev/gntdev.c index c9e42c1fd02b..eb5275771a80 100644 --- a/sys/dev/xen/gntdev/gntdev.c +++ b/sys/dev/xen/gntdev/gntdev.c @@ -836,8 +836,8 @@ gntdev_gmap_pg_fault(vm_object_t object, vm_ooffset_t offset, int prot, } vm_page_busy_acquire(page, 0); + 
vm_page_valid(page); vm_page_insert(page, object, pidx); - page->valid = VM_PAGE_BITS_ALL; *mres = page; return (VM_PAGER_OK); } diff --git a/sys/dev/xen/privcmd/privcmd.c b/sys/dev/xen/privcmd/privcmd.c index e424dc20d905..75e87df9dc57 100644 --- a/sys/dev/xen/privcmd/privcmd.c +++ b/sys/dev/xen/privcmd/privcmd.c @@ -179,8 +179,8 @@ privcmd_pg_fault(vm_object_t object, vm_ooffset_t offset, } vm_page_busy_acquire(page, 0); + vm_page_valid(page); vm_page_insert(page, object, pidx); - page->valid = VM_PAGE_BITS_ALL; *mres = page; return (VM_PAGER_OK); } diff --git a/sys/fs/nfsclient/nfs_clbio.c b/sys/fs/nfsclient/nfs_clbio.c index 07fec833b2d5..d8d4600c3ff9 100644 --- a/sys/fs/nfsclient/nfs_clbio.c +++ b/sys/fs/nfsclient/nfs_clbio.c @@ -174,7 +174,7 @@ ncl_getpages(struct vop_getpages_args *ap) * XXXGL: is that true for NFS, where short read can occur??? */ VM_OBJECT_WLOCK(object); - if (pages[npages - 1]->valid != 0 && --npages == 0) + if (!vm_page_none_valid(pages[npages - 1]) && --npages == 0) goto out; VM_OBJECT_WUNLOCK(object); @@ -227,14 +227,14 @@ ncl_getpages(struct vop_getpages_args *ap) /* * Read operation filled an entire page */ - m->valid = VM_PAGE_BITS_ALL; + vm_page_valid(m); KASSERT(m->dirty == 0, ("nfs_getpages: page %p is dirty", m)); } else if (size > toff) { /* * Read operation filled a partial page. */ - m->valid = 0; + vm_page_invalid(m); vm_page_set_valid_range(m, 0, size - toff); KASSERT(m->dirty == 0, ("nfs_getpages: page %p is dirty", m)); diff --git a/sys/fs/smbfs/smbfs_io.c b/sys/fs/smbfs/smbfs_io.c index fa6a14024213..ab1eb0574c1b 100644 --- a/sys/fs/smbfs/smbfs_io.c +++ b/sys/fs/smbfs/smbfs_io.c @@ -457,7 +457,7 @@ smbfs_getpages(ap) * XXXGL: is that true for SMB filesystem? 
*/ VM_OBJECT_WLOCK(object); - if (pages[npages - 1]->valid != 0 && --npages == 0) + if (!vm_page_none_valid(pages[npages - 1]) && --npages == 0) goto out; VM_OBJECT_WUNLOCK(object); @@ -505,14 +505,14 @@ smbfs_getpages(ap) /* * Read operation filled an entire page */ - m->valid = VM_PAGE_BITS_ALL; + vm_page_valid(m); KASSERT(m->dirty == 0, ("smbfs_getpages: page %p is dirty", m)); } else if (size > toff) { /* * Read operation filled a partial page. */ - m->valid = 0; + vm_page_invalid(m); vm_page_set_valid_range(m, 0, size - toff); KASSERT(m->dirty == 0, ("smbfs_getpages: page %p is dirty", m)); diff --git a/sys/fs/tmpfs/tmpfs_subr.c b/sys/fs/tmpfs/tmpfs_subr.c index c5b2286b1ece..b26a3906af15 100644 --- a/sys/fs/tmpfs/tmpfs_subr.c +++ b/sys/fs/tmpfs/tmpfs_subr.c @@ -1408,7 +1408,7 @@ tmpfs_reg_resize(struct vnode *vp, off_t newsize, boolean_t ignerr) retry: m = vm_page_grab(uobj, idx, VM_ALLOC_NOCREAT); if (m != NULL) { - MPASS(m->valid == VM_PAGE_BITS_ALL); + MPASS(vm_page_all_valid(m)); } else if (vm_pager_has_page(uobj, idx, NULL, NULL)) { m = vm_page_alloc(uobj, idx, VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL); diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index e1c647ca4f00..f2a6c1143cea 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -979,11 +979,15 @@ exec_map_first_page(struct image_params *imgp) retry: ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED); - if (ma[0]->valid != VM_PAGE_BITS_ALL) { + if (!vm_page_all_valid(ma[0])) { if (vm_page_busy_acquire(ma[0], VM_ALLOC_WAITFAIL) == 0) { vm_page_unwire_noq(ma[0]); goto retry; } + if (vm_page_all_valid(ma[0])) { + vm_page_xunbusy(ma[0]); + goto out; + } if (!vm_pager_has_page(object, 0, NULL, &after)) { if (vm_page_unwire_noq(ma[0])) vm_page_free(ma[0]); @@ -1029,6 +1033,8 @@ exec_map_first_page(struct image_params *imgp) for (i = 1; i < initial_pagein; i++) vm_page_readahead_finish(ma[i]); } + +out: VM_OBJECT_WUNLOCK(object); imgp->firstpage = 
sf_buf_alloc(ma[0], 0); diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c index b0aaac0659a5..e8d1c8f5c40d 100644 --- a/sys/kern/uipc_shm.c +++ b/sys/kern/uipc_shm.c @@ -459,7 +459,7 @@ shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie) retry: m = vm_page_grab(object, idx, VM_ALLOC_NOCREAT); if (m != NULL) { - MPASS(m->valid == VM_PAGE_BITS_ALL); + MPASS(vm_page_all_valid(m)); } else if (vm_pager_has_page(object, idx, NULL, NULL)) { m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL); @@ -485,7 +485,7 @@ shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie) } if (m != NULL) { pmap_zero_page_area(m, base, PAGE_SIZE - base); - KASSERT(m->valid == VM_PAGE_BITS_ALL, + KASSERT(vm_page_all_valid(m), ("shm_dotruncate: page %p is invalid", m)); vm_page_dirty(m); vm_page_xunbusy(m); diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index baeaf2e32dc0..3888942a1889 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -956,6 +956,12 @@ vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, { VM_OBJECT_ASSERT_LOCKED(m->object); + + /* + * This function and its results are protected by higher level + * synchronization requiring vnode and buf locks to page in and + * validate pages. 
+ */ if (bp->b_flags & B_CACHE) { int base = (foff + off) & PAGE_MASK; if (vm_page_is_valid(m, base, size) == 0) @@ -4640,7 +4646,7 @@ vfs_busy_pages(struct buf *bp, int clear_modify) if (clear_modify) { pmap_remove_write(m); vfs_page_set_validclean(bp, foff, m); - } else if (m->valid == VM_PAGE_BITS_ALL && + } else if (vm_page_all_valid(m) && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus = true; @@ -4681,6 +4687,14 @@ vfs_bio_set_valid(struct buf *bp, int base, int size) n = PAGE_SIZE - (base & PAGE_MASK); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); + + /* + * Busy may not be strictly necessary here because the pages are + * unlikely to be fully valid and the vnode lock will synchronize + * their access via getpages. It is grabbed for consistency with + * other page validation. + */ + vfs_busy_pages_acquire(bp); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) @@ -4690,6 +4704,7 @@ vfs_bio_set_valid(struct buf *bp, int base, int size) size -= n; n = PAGE_SIZE; } + vfs_busy_pages_release(bp); VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); } @@ -4717,6 +4732,7 @@ vfs_bio_clrbuf(struct buf *bp) bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); + vfs_busy_pages_acquire(bp); if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && (bp->b_offset & PAGE_MASK) == 0) { if (bp->b_pages[0] == bogus_page) @@ -4758,6 +4774,7 @@ vfs_bio_clrbuf(struct buf *bp) bp->b_pages[i]->valid |= mask; } unlock: + vfs_busy_pages_release(bp); VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); bp->b_resid = 0; } @@ -5189,7 +5206,7 @@ vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count, * the end of the function catches the race in a * reliable way (protected by the object lock). 
*/ - if (m->valid == VM_PAGE_BITS_ALL) + if (vm_page_all_valid(m)) continue; poff = IDX_TO_OFF(m->pindex); @@ -5219,7 +5236,7 @@ vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count, * cache pressure. */ if (buf_pager_relbuf || - m->valid != VM_PAGE_BITS_ALL) + !vm_page_all_valid(m)) bp->b_flags |= B_RELBUF; bp->b_flags &= ~B_NOCACHE; @@ -5229,12 +5246,12 @@ vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count, } } KASSERT(1 /* racy, enable for debugging */ || - m->valid == VM_PAGE_BITS_ALL || i == count - 1, + vm_page_all_valid(m) || i == count - 1, ("buf %d %p invalid", i, m)); if (i == count - 1 && lpart) { VM_OBJECT_WLOCK(object); - if (m->valid != 0 && - m->valid != VM_PAGE_BITS_ALL) + if (!vm_page_none_valid(m) && + !vm_page_all_valid(m)) vm_page_zero_invalid(m, TRUE); VM_OBJECT_WUNLOCK(object); } @@ -5261,7 +5278,7 @@ next_page:; * invalidated or removed, so we must restart for * safety as well. */ - if (ma[i]->valid != VM_PAGE_BITS_ALL) + if (!vm_page_all_valid(ma[i])) redo = true; } if (redo && error == 0) diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index 6a87dd28d57b..cb403bb81d24 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -465,11 +465,13 @@ cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, if (toff + tinc > PAGE_SIZE) tinc = PAGE_SIZE - toff; VM_OBJECT_ASSERT_WLOCKED(tbp->b_pages[j]->object); - if ((tbp->b_pages[j]->valid & - vm_page_bits(toff, tinc)) != 0) - break; if (vm_page_trysbusy(tbp->b_pages[j]) == 0) break; + if ((tbp->b_pages[j]->valid & + vm_page_bits(toff, tinc)) != 0) { + vm_page_sunbusy(tbp->b_pages[j]); + break; + } vm_object_pip_add(tbp->b_bufobj->bo_object, 1); off += tinc; tsize -= tinc; @@ -524,7 +526,7 @@ cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, bp->b_pages[bp->b_npages] = m; bp->b_npages++; } - if (m->valid == VM_PAGE_BITS_ALL) + if (vm_page_all_valid(m)) tbp->b_pages[j] = bogus_page; } VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); @@ -548,7 
+550,7 @@ cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); for (j = 0; j < bp->b_npages; j++) { VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[j]->object); - if (bp->b_pages[j]->valid == VM_PAGE_BITS_ALL) + if (vm_page_all_valid(bp->b_pages[j])) bp->b_pages[j] = bogus_page; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); diff --git a/sys/vm/device_pager.c b/sys/vm/device_pager.c index cc6898fe235d..f7afe67911f3 100644 --- a/sys/vm/device_pager.c +++ b/sys/vm/device_pager.c @@ -395,7 +395,7 @@ old_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot, vm_page_free(*mres); *mres = page; } - page->valid = VM_PAGE_BITS_ALL; + vm_page_valid(page); return (VM_PAGER_OK); } diff --git a/sys/vm/phys_pager.c b/sys/vm/phys_pager.c index 43e63ec1d5dd..7da2adf823d1 100644 --- a/sys/vm/phys_pager.c +++ b/sys/vm/phys_pager.c @@ -145,12 +145,12 @@ phys_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, VM_OBJECT_ASSERT_WLOCKED(object); for (i = 0; i < count; i++) { - if (m[i]->valid == 0) { + if (vm_page_none_valid(m[i])) { if ((m[i]->flags & PG_ZERO) == 0) pmap_zero_page(m[i]); - m[i]->valid = VM_PAGE_BITS_ALL; + vm_page_valid(m[i]); } - KASSERT(m[i]->valid == VM_PAGE_BITS_ALL, + KASSERT(vm_page_all_valid(m[i]), ("phys_pager_getpages: partially valid page %p", m[i])); KASSERT(m[i]->dirty == 0, ("phys_pager_getpages: dirty page %p", m[i])); @@ -209,10 +209,8 @@ phys_pager_populate(vm_object_t object, vm_pindex_t pidx, ahead = MIN(end - i, PHYSALLOC); m = vm_page_grab(object, i, VM_ALLOC_NORMAL | VM_ALLOC_COUNT(ahead)); - if (m->valid != VM_PAGE_BITS_ALL) { + if (!vm_page_all_valid(m)) vm_page_zero_invalid(m, TRUE); - m->valid = VM_PAGE_BITS_ALL; - } KASSERT(m->dirty == 0, ("phys_pager_populate: dirty page %p", m)); } diff --git a/sys/vm/sg_pager.c b/sys/vm/sg_pager.c index a31c2bb8fd27..520476a4b331 100644 --- a/sys/vm/sg_pager.c +++ b/sys/vm/sg_pager.c @@ -198,7 +198,7 @@ 
sg_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, vm_page_free(m[0]); vm_page_unlock(m[0]); m[0] = page; - page->valid = VM_PAGE_BITS_ALL; + vm_page_valid(page); if (rbehind) *rbehind = 0; diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 588c7aa4cb56..f2100fec560e 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -1554,7 +1554,7 @@ swp_pager_async_iodone(struct buf *bp) * be overridden by the original caller of * getpages so don't play cute tricks here. */ - m->valid = 0; + vm_page_invalid(m); } else { /* * If a write error occurs, reactivate page @@ -1582,7 +1582,7 @@ swp_pager_async_iodone(struct buf *bp) KASSERT(m->dirty == 0, ("swp_pager_async_iodone: page %p is dirty", m)); - m->valid = VM_PAGE_BITS_ALL; + vm_page_valid(m); if (i < bp->b_pgbefore || i >= bp->b_npages - bp->b_pgafter) vm_page_readahead_finish(m); diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index b482bdb6fffb..37dd49672660 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -211,6 +211,7 @@ vm_fault_dirty(vm_map_entry_t entry, vm_page_t m, vm_prot_t prot, return; VM_OBJECT_ASSERT_LOCKED(m->object); + VM_PAGE_OBJECT_BUSY_ASSERT(m); need_dirty = ((fault_type & VM_PROT_WRITE) != 0 && (fault_flags & VM_FAULT_WIRE) == 0) || @@ -285,7 +286,7 @@ vm_fault_soft_fast(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot, m = vm_page_lookup(fs->first_object, fs->first_pindex); /* A busy page can be mapped for read|execute access. */ if (m == NULL || ((prot & VM_PROT_WRITE) != 0 && - vm_page_busied(m)) || m->valid != VM_PAGE_BITS_ALL) { + vm_page_busied(m)) || !vm_page_all_valid(m)) { rv = KERN_FAILURE; goto out; } @@ -368,7 +369,7 @@ vm_fault_populate_check_page(vm_page_t m) * valid, and exclusively busied. 
*/ MPASS(m != NULL); - MPASS(m->valid == VM_PAGE_BITS_ALL); + MPASS(vm_page_all_valid(m)); MPASS(vm_page_xbusied(m)); } @@ -830,7 +831,7 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, * (readable), jump to readrest, else break-out ( we * found the page ). */ - if (fs.m->valid != VM_PAGE_BITS_ALL) + if (!vm_page_all_valid(fs.m)) goto readrest; break; /* break to PAGE HAS BEEN FOUND */ } @@ -1154,7 +1155,7 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, VM_CNT_INC(v_ozfod); } VM_CNT_INC(v_zfod); - fs.m->valid = VM_PAGE_BITS_ALL; + vm_page_valid(fs.m); /* Don't try to prefault neighboring pages. */ faultcount = 1; break; /* break to PAGE HAS BEEN FOUND */ @@ -1245,7 +1246,7 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, * Oh, well, lets copy it. */ pmap_copy_page(fs.m, fs.first_m); - fs.first_m->valid = VM_PAGE_BITS_ALL; + vm_page_valid(fs.first_m); if (wired && (fault_flags & VM_FAULT_WIRE) == 0) { vm_page_wire(fs.first_m); @@ -1364,7 +1365,7 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, * Page must be completely valid or it is not fit to * map into user space. vm_pager_get_pages() ensures this. 
*/ - KASSERT(fs.m->valid == VM_PAGE_BITS_ALL, + KASSERT(vm_page_all_valid(fs.m), ("vm_fault: page %p partially invalid", fs.m)); VM_OBJECT_WUNLOCK(fs.object); @@ -1480,7 +1481,7 @@ vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead) entry->start); while ((m = m_next) != NULL && m->pindex < pend) { m_next = TAILQ_NEXT(m, listq); - if (m->valid != VM_PAGE_BITS_ALL || + if (!vm_page_all_valid(m) || vm_page_busied(m)) continue; @@ -1577,7 +1578,7 @@ vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra, VM_OBJECT_RUNLOCK(lobject); break; } - if (m->valid == VM_PAGE_BITS_ALL && + if (vm_page_all_valid(m) && (m->flags & PG_FICTITIOUS) == 0) pmap_enter_quick(pmap, addr, m, entry->protection); if (!obj_locked || lobject != entry->object.vm_object) @@ -1852,7 +1853,7 @@ vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, * all copies of the wired map entry have similar * backing pages. */ - if (dst_m->valid == VM_PAGE_BITS_ALL) { + if (vm_page_all_valid(dst_m)) { pmap_enter(dst_map->pmap, vaddr, dst_m, prot, access | (upgrade ? 
PMAP_ENTER_WIRED : 0), 0); } diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index cec76b76a4a7..12a1c2c24060 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -2358,7 +2358,7 @@ vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot, psize = tmpidx; break; } - if (p->valid == VM_PAGE_BITS_ALL) { + if (vm_page_all_valid(p)) { if (p_start == NULL) { start = addr + ptoa(tmpidx); p_start = p; diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index 23bdde163cec..fd31def69017 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -893,7 +893,7 @@ kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec) } } else vm_page_unlock(m); - KASSERT(m->valid == VM_PAGE_BITS_ALL, + KASSERT(vm_page_all_valid(m), ("mincore: page %p is mapped but invalid", m)); } else if (mincoreinfo == 0) { @@ -915,7 +915,7 @@ kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec) pindex = OFF_TO_IDX(current->offset + (addr - current->start)); m = vm_page_lookup(object, pindex); - if (m != NULL && m->valid == 0) + if (m != NULL && vm_page_none_valid(m)) m = NULL; if (m != NULL) mincoreinfo = MINCORE_INCORE; diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c index 63ca4a829e96..3d1401872064 100644 --- a/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -841,7 +841,7 @@ vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end, if (pi >= tend) break; np = TAILQ_NEXT(p, listq); - if (p->valid == 0) + if (vm_page_none_valid(p)) continue; if (vm_page_busy_acquire(p, VM_ALLOC_WAITFAIL) == 0) { if (object->generation != curgeneration) { @@ -1161,10 +1161,10 @@ vm_object_madvise(vm_object_t object, vm_pindex_t pindex, vm_pindex_t end, } /* - * If the page is not in a normal state, skip it. + * If the page is not in a normal state, skip it. The page + * can not be invalidated while the object lock is held. 
*/ - if (tm->valid != VM_PAGE_BITS_ALL || - vm_page_wired(tm)) + if (!vm_page_all_valid(tm) || vm_page_wired(tm)) goto next_pindex; KASSERT((tm->flags & PG_FICTITIOUS) == 0, ("vm_object_madvise: page %p is fictitious", tm)); @@ -1488,7 +1488,11 @@ vm_object_scan_all_shadowed(vm_object_t object) * object and we might as well give up now. */ pp = vm_page_lookup(object, new_pindex); - if ((pp == NULL || pp->valid == 0) && + /* + * The valid check here is stable due to object lock being + * required to clear valid and initiate paging. + */ + if ((pp == NULL || vm_page_none_valid(pp)) && !vm_pager_has_page(object, new_pindex, NULL, NULL)) return (false); } @@ -1567,7 +1571,7 @@ vm_object_collapse_scan(vm_object_t object, int op) continue; } - KASSERT(pp == NULL || pp->valid != 0, + KASSERT(pp == NULL || !vm_page_none_valid(pp), ("unbusy invalid page %p", pp)); if (pp != NULL || vm_pager_has_page(object, new_pindex, NULL, @@ -1894,7 +1898,7 @@ vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, object->ref_count != 0) pmap_remove_all(p); if ((options & OBJPR_CLEANONLY) == 0) { - p->valid = 0; + vm_page_invalid(p); vm_page_undirty(p); } vm_page_xunbusy(p); @@ -1902,7 +1906,8 @@ vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, } KASSERT((p->flags & PG_FICTITIOUS) == 0, ("vm_object_page_remove: page %p is fictitious", p)); - if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) { + if ((options & OBJPR_CLEANONLY) != 0 && + !vm_page_none_valid(p)) { if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0 && !vm_page_try_remove_write(p)) diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 462f482950f7..e46bda993beb 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -1330,7 +1330,7 @@ vm_page_readahead_finish(vm_page_t m) { /* We shouldn't put invalid pages on queues. 
*/ - KASSERT(m->valid != 0, ("%s: %p is invalid", __func__, m)); + KASSERT(!vm_page_none_valid(m), ("%s: %p is invalid", __func__, m)); /* * Since the page is not the actually needed one, whether it should @@ -1430,8 +1430,7 @@ vm_page_dirty_KBI(vm_page_t m) { /* Refer to this operation by its public name. */ - KASSERT(m->valid == VM_PAGE_BITS_ALL, - ("vm_page_dirty: page is invalid!")); + KASSERT(vm_page_all_valid(m), ("vm_page_dirty: page is invalid!")); m->dirty = VM_PAGE_BITS_ALL; } @@ -2657,7 +2656,7 @@ vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, ("page %p has unexpected oflags", m)); /* Don't care: VPO_NOSYNC. */ - if (m->valid != 0) { + if (!vm_page_none_valid(m)) { /* * First, try to allocate a new page * that is above "high". Failing @@ -4334,7 +4333,7 @@ vm_page_grab_valid(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex, int al * However, we will not end up with an invalid page and a * shared lock. */ - if (m->valid != VM_PAGE_BITS_ALL || + if (!vm_page_all_valid(m) || (allocflags & (VM_ALLOC_IGN_SBUSY | VM_ALLOC_SBUSY)) == 0) { sleep = !vm_page_tryxbusy(m); xbusy = true; @@ -4354,7 +4353,7 @@ vm_page_grab_valid(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex, int al goto retrylookup; } if ((allocflags & VM_ALLOC_NOCREAT) != 0 && - m->valid != VM_PAGE_BITS_ALL) { + !vm_page_all_valid(m)) { if (xbusy) vm_page_xunbusy(m); else @@ -4364,7 +4363,7 @@ vm_page_grab_valid(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex, int al } if ((allocflags & VM_ALLOC_WIRED) != 0) vm_page_wire(m); - if (m->valid == VM_PAGE_BITS_ALL) + if (vm_page_all_valid(m)) goto out; } else if ((allocflags & VM_ALLOC_NOCREAT) != 0) { *mp = NULL; @@ -4386,7 +4385,7 @@ vm_page_grab_valid(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex, int al *mp = NULL; return (rv); } - MPASS(m->valid == VM_PAGE_BITS_ALL); + MPASS(vm_page_all_valid(m)); } else { vm_page_zero_invalid(m, TRUE); } @@ -4499,10 
+4498,11 @@ vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags, goto retrylookup; } } - if (m->valid == 0 && (allocflags & VM_ALLOC_ZERO) != 0) { + if (vm_page_none_valid(m) && + (allocflags & VM_ALLOC_ZERO) != 0) { if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); - m->valid = VM_PAGE_BITS_ALL; + vm_page_valid(m); } if ((allocflags & VM_ALLOC_NOBUSY) != 0) { if ((allocflags & VM_ALLOC_IGN_SBUSY) != 0) @@ -4542,6 +4542,72 @@ vm_page_bits(int base, int size) ((vm_page_bits_t)1 << first_bit)); } +static inline void +vm_page_bits_set(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t set) +{ + +#if PAGE_SIZE == 32768 + atomic_set_64((uint64_t *)bits, set); +#elif PAGE_SIZE == 16384 + atomic_set_32((uint32_t *)bits, set); +#elif (PAGE_SIZE == 8192) && defined(atomic_set_16) + atomic_set_16((uint16_t *)bits, set); +#elif (PAGE_SIZE == 4096) && defined(atomic_set_8) + atomic_set_8((uint8_t *)bits, set); +#else /* PAGE_SIZE <= 8192 */ + uintptr_t addr; + int shift; + + addr = (uintptr_t)bits; + /* + * Use a trick to perform a 32-bit atomic on the + * containing aligned word, to not depend on the existence + * of atomic_{set, clear}_{8, 16}. 
+ */ + shift = addr & (sizeof(uint32_t) - 1); +#if BYTE_ORDER == BIG_ENDIAN + shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY; +#else + shift *= NBBY; +#endif + addr &= ~(sizeof(uint32_t) - 1); + atomic_set_32((uint32_t *)addr, set << shift); +#endif /* PAGE_SIZE */ +} + +static inline void +vm_page_bits_clear(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t clear) +{ + +#if PAGE_SIZE == 32768 + atomic_clear_64((uint64_t *)bits, clear); +#elif PAGE_SIZE == 16384 + atomic_clear_32((uint32_t *)bits, clear); +#elif (PAGE_SIZE == 8192) && defined(atomic_clear_16) + atomic_clear_16((uint16_t *)bits, clear); +#elif (PAGE_SIZE == 4096) && defined(atomic_clear_8) + atomic_clear_8((uint8_t *)bits, clear); +#else /* PAGE_SIZE <= 8192 */ + uintptr_t addr; + int shift; + + addr = (uintptr_t)bits; + /* + * Use a trick to perform a 32-bit atomic on the + * containing aligned word, to not depend on the existence + * of atomic_{set, clear}_{8, 16}. + */ + shift = addr & (sizeof(uint32_t) - 1); +#if BYTE_ORDER == BIG_ENDIAN + shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY; +#else + shift *= NBBY; +#endif + addr &= ~(sizeof(uint32_t) - 1); + atomic_clear_32((uint32_t *)addr, clear << shift); +#endif /* PAGE_SIZE */ +} + /* * vm_page_set_valid_range: * @@ -4556,8 +4622,9 @@ void vm_page_set_valid_range(vm_page_t m, int base, int size) { int endoff, frag; + vm_page_bits_t pagebits; - VM_OBJECT_ASSERT_WLOCKED(m->object); + vm_page_assert_busied(m); if (size == 0) /* handle degenerate case */ return; @@ -4591,7 +4658,11 @@ vm_page_set_valid_range(vm_page_t m, int base, int size) /* * Set valid bits inclusive of any overlap. 
*/ - m->valid |= vm_page_bits(base, size); + pagebits = vm_page_bits(base, size); + if (vm_page_xbusied(m)) + m->valid |= pagebits; + else + vm_page_bits_set(m, &m->valid, pagebits); } /* @@ -4600,52 +4671,20 @@ vm_page_set_valid_range(vm_page_t m, int base, int size) static __inline void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits) { - uintptr_t addr; -#if PAGE_SIZE < 16384 - int shift; -#endif + + vm_page_assert_busied(m); /* - * If the object is locked and the page is neither exclusive busy nor - * write mapped, then the page's dirty field cannot possibly be - * set by a concurrent pmap operation. + * If the page is xbusied and not write mapped we are the + * only thread that can modify dirty bits. Otherwise, the pmap + * layer can call vm_page_dirty() without holding a distinguished + * lock. The combination of page busy and atomic operations + * suffices to guarantee consistency of the page dirty field. */ - VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) + if (vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) m->dirty &= ~pagebits; - else { - /* - * The pmap layer can call vm_page_dirty() without - * holding a distinguished lock. The combination of - * the object's lock and an atomic operation suffice - * to guarantee consistency of the page dirty field. - * - * For PAGE_SIZE == 32768 case, compiler already - * properly aligns the dirty field, so no forcible - * alignment is needed. Only require existence of - * atomic_clear_64 when page size is 32768. - */ - addr = (uintptr_t)&m->dirty; -#if PAGE_SIZE == 32768 - atomic_clear_64((uint64_t *)addr, pagebits); -#elif PAGE_SIZE == 16384 - atomic_clear_32((uint32_t *)addr, pagebits); -#else /* PAGE_SIZE <= 8192 */ - /* - * Use a trick to perform a 32-bit atomic on the - * containing aligned word, to not depend on the existence - * of atomic_clear_{8, 16}. 
- */ - shift = addr & (sizeof(uint32_t) - 1); -#if BYTE_ORDER == BIG_ENDIAN - shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY; -#else - shift *= NBBY; -#endif - addr &= ~(sizeof(uint32_t) - 1); - atomic_clear_32((uint32_t *)addr, pagebits << shift); -#endif /* PAGE_SIZE */ - } + else + vm_page_bits_clear(m, &m->dirty, pagebits); } /* @@ -4664,7 +4703,9 @@ vm_page_set_validclean(vm_page_t m, int base, int size) vm_page_bits_t oldvalid, pagebits; int endoff, frag; + /* Object lock for VPO_NOSYNC */ VM_OBJECT_ASSERT_WLOCKED(m->object); + vm_page_assert_busied(m); if (size == 0) /* handle degenerate case */ return; @@ -4701,7 +4742,10 @@ vm_page_set_validclean(vm_page_t m, int base, int size) */ oldvalid = m->valid; pagebits = vm_page_bits(base, size); - m->valid |= pagebits; + if (vm_page_xbusied(m)) + m->valid |= pagebits; + else + vm_page_bits_set(m, &m->valid, pagebits); #if 0 /* NOT YET */ if ((frag = base & (DEV_BSIZE - 1)) != 0) { frag = DEV_BSIZE - frag; @@ -4730,7 +4774,7 @@ vm_page_set_validclean(vm_page_t m, int base, int size) pmap_clear_modify(m); m->dirty = 0; m->oflags &= ~VPO_NOSYNC; - } else if (oldvalid != VM_PAGE_BITS_ALL) + } else if (oldvalid != VM_PAGE_BITS_ALL && vm_page_xbusied(m)) m->dirty &= ~pagebits; else vm_page_clear_dirty_mask(m, pagebits); @@ -4755,21 +4799,53 @@ vm_page_set_invalid(vm_page_t m, int base, int size) vm_page_bits_t bits; vm_object_t object; + /* + * The object lock is required so that pages can't be mapped + * read-only while we're in the process of invalidating them. 
+ */ object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); + vm_page_assert_busied(m); + if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) + size >= object->un_pager.vnp.vnp_size) bits = VM_PAGE_BITS_ALL; else bits = vm_page_bits(base, size); - if (object->ref_count != 0 && m->valid == VM_PAGE_BITS_ALL && - bits != 0) + if (object->ref_count != 0 && vm_page_all_valid(m) && bits != 0) pmap_remove_all(m); - KASSERT((bits == 0 && m->valid == VM_PAGE_BITS_ALL) || + KASSERT((bits == 0 && vm_page_all_valid(m)) || !pmap_page_is_mapped(m), ("vm_page_set_invalid: page %p is mapped", m)); - m->valid &= ~bits; - m->dirty &= ~bits; + if (vm_page_xbusied(m)) { + m->valid &= ~bits; + m->dirty &= ~bits; + } else { + vm_page_bits_clear(m, &m->valid, bits); + vm_page_bits_clear(m, &m->dirty, bits); + } +} + +/* + * vm_page_invalid: + * + * Invalidates the entire page. The page must be busy, unmapped, and + * the enclosing object must be locked. The object lock protects + * against concurrent read-only pmap enter which is done without + * busy. + */ +void +vm_page_invalid(vm_page_t m) +{ + + vm_page_assert_busied(m); + VM_OBJECT_ASSERT_LOCKED(m->object); + MPASS(!pmap_page_is_mapped(m)); + + if (vm_page_xbusied(m)) + m->valid = 0; + else + vm_page_bits_clear(m, &m->valid, VM_PAGE_BITS_ALL); } /* @@ -4789,7 +4865,6 @@ vm_page_zero_invalid(vm_page_t m, boolean_t setvalid) int b; int i; - VM_OBJECT_ASSERT_WLOCKED(m->object); /* * Scan the valid bits looking for invalid sections that * must be zeroed. Invalid sub-DEV_BSIZE'd areas ( where the @@ -4813,7 +4888,7 @@ vm_page_zero_invalid(vm_page_t m, boolean_t setvalid) * issues. e.g. it is ok to do with UFS, but not ok to do with NFS. */ if (setvalid) - m->valid = VM_PAGE_BITS_ALL; + vm_page_valid(m); } /* @@ -4822,13 +4897,16 @@ vm_page_zero_invalid(vm_page_t m, boolean_t setvalid) * Is (partial) page valid? 
Note that the case where size == 0 * will return FALSE in the degenerate case where the page is * entirely invalid, and TRUE otherwise. + * + * Some callers invoke this routine without the busy lock held and + * handle races via higher level locks. Typical callers should + * hold a busy lock to prevent invalidation. */ int vm_page_is_valid(vm_page_t m, int base, int size) { vm_page_bits_t bits; - VM_OBJECT_ASSERT_LOCKED(m->object); bits = vm_page_bits(base, size); return (m->valid != 0 && (m->valid & bits) == bits); } @@ -4886,11 +4964,22 @@ void vm_page_test_dirty(vm_page_t m) { - VM_OBJECT_ASSERT_WLOCKED(m->object); + vm_page_assert_busied(m); if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) vm_page_dirty(m); } +void +vm_page_valid(vm_page_t m) +{ + + vm_page_assert_busied(m); + if (vm_page_xbusied(m)) + m->valid = VM_PAGE_BITS_ALL; + else + vm_page_bits_set(m, &m->valid, VM_PAGE_BITS_ALL); +} + void vm_page_lock_KBI(vm_page_t m, const char *file, int line) { diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index cd98be020184..8e788c1baa53 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -92,31 +92,66 @@ * and sundry status bits. * * In general, operations on this structure's mutable fields are - * synchronized using either one of or a combination of the lock on the - * object that the page belongs to (O), the page lock (P), - * the per-domain lock for the free queues (F), or the page's queue - * lock (Q). The physical address of a page is used to select its page - * lock from a pool. The queue lock for a page depends on the value of - * its queue field and described in detail below. If a field is - * annotated below with two of these locks, then holding either lock is - * sufficient for read access, but both locks are required for write - * access. An annotation of (C) indicates that the field is immutable. - * An annotation of (A) indicates that modifications to the field must - * be atomic. 
Accesses to such fields may require additional - * synchronization depending on the context. + * synchronized using either one of or a combination of locks. If a + * field is annotated with two of these locks then holding either is + * sufficient for read access but both are required for write access. + * The physical address of a page is used to select its page lock from + * a pool. The queue lock for a page depends on the value of its queue + * field and is described in detail below. + * + * The following annotations are possible: + * (A) the field is atomic and may require additional synchronization. + * (B) the page busy lock. + * (C) the field is immutable. + * (F) the per-domain lock for the free queues + * (M) Machine dependent, defined by pmap layer. + * (O) the object that the page belongs to. + * (P) the page lock. + * (Q) the page's queue lock. + * + * The busy lock is an embedded reader-writer lock that protects the + * page's contents and identity (i.e., its <object, pindex> tuple) as + * well as certain valid/dirty modifications. To avoid bloating the + * page structure, the busy lock lacks some of the features available + * to the kernel's general-purpose synchronization primitives. As a result, + * busy lock ordering rules are not verified, lock recursion is not + * detected, and an attempt to xbusy a busy page or sbusy an xbusy page + * will trigger a panic rather than causing the thread to block. + * vm_page_sleep_if_busy() can be used to sleep until the page's busy + * state changes, after which the caller must re-lookup the page and + * re-evaluate its state. vm_page_busy_acquire() will block until + * the lock is acquired. + * + * The valid field is protected by the page busy lock (B) and object + * lock (O). Transitions from invalid to valid are generally done + * via I/O or zero filling and do not require the object lock. + * These must be protected with the busy lock to prevent page-in or + * creation races. 
Page invalidation generally happens as a result + * of truncate or msync. When invalidated, pages must not be present + * in pmap and must hold the object lock to prevent concurrent + * speculative read-only mappings that do not require busy. I/O + * routines may check for validity without a lock if they are prepared + * to handle invalidation races with higher level locks (vnode) or are + * unconcerned with races so long as they hold a reference to prevent + * recycling. When a valid bit is set while holding a shared busy + * lock (A) atomic operations are used to protect against concurrent + * modification. * * In contrast, the synchronization of accesses to the page's - * dirty field is machine dependent (M). In the - * machine-independent layer, the lock on the object that the - * page belongs to must be held in order to operate on the field. - * However, the pmap layer is permitted to set all bits within - * the field without holding that lock. If the underlying - * architecture does not support atomic read-modify-write + * dirty field is a mix of machine dependent (M) and busy (B). In + * the machine-independent layer, the page busy must be held to + * operate on the field. However, the pmap layer is permitted to + * set all bits within the field without holding that lock. If the + * underlying architecture does not support atomic read-modify-write * operations on the field's type, then the machine-independent * layer uses a 32-bit atomic on the aligned 32-bit word that * contains the dirty field. In the machine-independent layer, * the implementation of read-modify-write operations on the - * field is encapsulated in vm_page_clear_dirty_mask(). + * field is encapsulated in vm_page_clear_dirty_mask(). An + * exclusive busy lock combined with pmap_remove_{write/all}() is the + * only way to ensure a page can not become dirty. I/O generally + * removes the page from pmap to ensure exclusive access and atomic + * writes. 
* * The ref_count field tracks references to the page. References that * prevent the page from being reclaimable are called wirings and are @@ -136,19 +171,6 @@ * The page daemon must therefore handle the possibility of a concurrent * free of the page. * - * The busy lock is an embedded reader-writer lock which protects the - * page's contents and identity (i.e., its tuple) and - * interlocks with the object lock (O). In particular, a page may be - * busied or unbusied only with the object write lock held. To avoid - * bloating the page structure, the busy lock lacks some of the - * features available to the kernel's general-purpose synchronization - * primitives. As a result, busy lock ordering rules are not verified, - * lock recursion is not detected, and an attempt to xbusy a busy page - * or sbusy an xbusy page results will trigger a panic rather than - * causing the thread to block. vm_page_sleep_if_busy() can be used to - * sleep until the page's busy state changes, after which the caller - * must re-lookup the page and re-evaluate its state. - * * The queue field is the index of the page queue containing the page, * or PQ_NONE if the page is not enqueued. 
The queue lock of a page is * the page queue lock corresponding to the page queue index, or the @@ -215,7 +237,7 @@ struct vm_page { uint16_t flags; /* page PG_* flags (P) */ uint8_t order; /* index of the buddy queue (F) */ uint8_t pool; /* vm_phys freepool index (F) */ - uint8_t aflags; /* access is atomic */ + uint8_t aflags; /* atomic flags (A) */ uint8_t oflags; /* page VPO_* flags (O) */ uint8_t queue; /* page queue index (Q) */ int8_t psind; /* pagesizes[] index (O) */ @@ -223,8 +245,8 @@ struct vm_page { u_char act_count; /* page usage count (P) */ /* NOTE that these must support one bit per DEV_BSIZE in a page */ /* so, on normal X86 kernels, they must be at least 8 bits wide */ - vm_page_bits_t valid; /* map of valid DEV_BSIZE chunks (O) */ - vm_page_bits_t dirty; /* map of dirty DEV_BSIZE chunks (M) */ + vm_page_bits_t valid; /* valid DEV_BSIZE chunk map (O,B) */ + vm_page_bits_t dirty; /* dirty DEV_BSIZE chunk map (M,B) */ }; /* @@ -579,6 +601,7 @@ bool vm_page_free_prep(vm_page_t m); vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr); void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr); int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t); +void vm_page_invalid(vm_page_t m); void vm_page_launder(vm_page_t m); vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t); vm_page_t vm_page_next(vm_page_t m); @@ -625,10 +648,11 @@ void vm_page_wire(vm_page_t); bool vm_page_wire_mapped(vm_page_t m); void vm_page_xunbusy_hard(vm_page_t m); void vm_page_set_validclean (vm_page_t, int, int); -void vm_page_clear_dirty (vm_page_t, int, int); -void vm_page_set_invalid (vm_page_t, int, int); -int vm_page_is_valid (vm_page_t, int, int); -void vm_page_test_dirty (vm_page_t); +void vm_page_clear_dirty(vm_page_t, int, int); +void vm_page_set_invalid(vm_page_t, int, int); +void vm_page_valid(vm_page_t m); +int vm_page_is_valid(vm_page_t, int, int); +void vm_page_test_dirty(vm_page_t); vm_page_bits_t vm_page_bits(int base, int size); 
void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid); void vm_page_free_toq(vm_page_t m); @@ -643,6 +667,11 @@ void vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line); void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line); #endif +#define vm_page_assert_busied(m) \ + KASSERT(vm_page_busied(m), \ + ("vm_page_assert_busied: page %p not busy @ %s:%d", \ + (m), __FILE__, __LINE__)) + #define vm_page_assert_sbusied(m) \ KASSERT(vm_page_sbusied(m), \ ("vm_page_assert_sbusied: page %p not shared busy @ %s:%d", \ @@ -928,5 +957,19 @@ vm_page_wired(vm_page_t m) return (VPRC_WIRE_COUNT(m->ref_count) > 0); } +static inline bool +vm_page_all_valid(vm_page_t m) +{ + + return (m->valid == VM_PAGE_BITS_ALL); +} + +static inline bool +vm_page_none_valid(vm_page_t m) +{ + + return (m->valid == 0); +} + #endif /* _KERNEL */ #endif /* !_VM_PAGE_ */ diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 2ad647c96562..7e638821cd34 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -469,7 +469,7 @@ vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen, * edge case with file fragments. */ for (i = 0; i < count; i++) { - KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, + KASSERT(vm_page_all_valid(mc[i]), ("vm_pageout_flush: partially invalid page %p index %d/%d", mc[i], i, count)); KASSERT((mc[i]->aflags & PGA_WRITEABLE) == 0, @@ -829,7 +829,7 @@ vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) * Invalid pages can be easily freed. They cannot be * mapped; vm_page_free() asserts this. */ - if (m->valid == 0) + if (vm_page_none_valid(m)) goto free_page; /* @@ -1560,7 +1560,7 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage, * Invalid pages can be easily freed. They cannot be * mapped, vm_page_free() asserts this. 
*/ - if (m->valid == 0) + if (vm_page_none_valid(m)) goto free_page; /* diff --git a/sys/vm/vm_swapout.c b/sys/vm/vm_swapout.c index ada881018f32..28a6a4d91afe 100644 --- a/sys/vm/vm_swapout.c +++ b/sys/vm/vm_swapout.c @@ -582,14 +582,14 @@ vm_thread_swapin(struct thread *td, int oom_alloc) pages); for (i = 0; i < pages;) { vm_page_assert_xbusied(ma[i]); - if (ma[i]->valid == VM_PAGE_BITS_ALL) { + if (vm_page_all_valid(ma[i])) { vm_page_xunbusy(ma[i]); i++; continue; } vm_object_pip_add(ksobj, 1); for (j = i + 1; j < pages; j++) - if (ma[j]->valid == VM_PAGE_BITS_ALL) + if (vm_page_all_valid(ma[j])) break; rv = vm_pager_has_page(ksobj, ma[i]->pindex, NULL, &a); KASSERT(rv == 1, ("%s: missing page %p", __func__, ma[i])); diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c index cd12522fd617..f19c3a0af3f0 100644 --- a/sys/vm/vnode_pager.c +++ b/sys/vm/vnode_pager.c @@ -471,9 +471,12 @@ vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize) * completely invalid page and mark it partially valid * it can screw up NFS reads, so we don't allow the case. */ - if ((nsize & PAGE_MASK) && - (m = vm_page_lookup(object, OFF_TO_IDX(nsize))) != NULL && - m->valid != 0) { + if (!(nsize & PAGE_MASK)) + goto out; + m = vm_page_grab(object, OFF_TO_IDX(nsize), VM_ALLOC_NOCREAT); + if (m == NULL) + goto out; + if (!vm_page_none_valid(m)) { int base = (int)nsize & PAGE_MASK; int size = PAGE_SIZE - base; @@ -506,7 +509,9 @@ vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize) */ vm_page_clear_dirty(m, base, PAGE_SIZE - base); } + vm_page_xunbusy(m); } +out: object->un_pager.vnp.vnp_size = nsize; object->size = nobjsize; VM_OBJECT_WUNLOCK(object); @@ -701,7 +706,7 @@ vnode_pager_input_old(vm_object_t object, vm_page_t m) } KASSERT(m->dirty == 0, ("vnode_pager_input_old: page %p is dirty", m)); if (!error) - m->valid = VM_PAGE_BITS_ALL; + vm_page_valid(m); return error ? 
VM_PAGER_ERROR : VM_PAGER_OK; } @@ -810,7 +815,7 @@ vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count, * exist at the end of file, and the page is made fully valid * by zeroing in vm_pager_get_pages(). */ - if (m[count - 1]->valid != 0 && --count == 0) { + if (!vm_page_none_valid(m[count - 1]) && --count == 0) { if (iodone != NULL) iodone(arg, m, 1, 0); return (VM_PAGER_OK); @@ -870,7 +875,7 @@ vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count, KASSERT(m[0]->dirty == 0, ("%s: page %p is dirty", __func__, m[0])); VM_OBJECT_WLOCK(object); - m[0]->valid = VM_PAGE_BITS_ALL; + vm_page_valid(m[0]); VM_OBJECT_WUNLOCK(object); return (VM_PAGER_OK); } @@ -1136,7 +1141,7 @@ vnode_pager_generic_getpages_done(struct buf *bp) /* * Read filled up entire page. */ - mt->valid = VM_PAGE_BITS_ALL; + vm_page_valid(mt); KASSERT(mt->dirty == 0, ("%s: page %p is dirty", __func__, mt)); KASSERT(!pmap_page_is_mapped(mt),