Mirror of https://git.FreeBSD.org/src.git, synced 2024-12-15 10:17:20 +00:00
Introduce a new mechanism for relocating virtual pages to a new physical
address and use this mechanism when:

1. kmem_alloc_{attr,contig}() can't find suitable free pages in the physical
   memory allocator's free page lists.  This replaces the long-standing
   approach of scanning the inactive and active queues, converting clean
   pages into PG_CACHED pages and laundering dirty pages.  In contrast, the
   new mechanism does not use PG_CACHED pages nor does it trigger a large
   number of I/O operations.

2. on 32-bit MIPS processors, uma_small_alloc() and the pmap can't find free
   pages in the physical memory allocator's free page lists that are covered
   by the direct map.  Tested by: adrian

3. ttm_bo_global_init() and ttm_vm_page_alloc_dma32() can't find suitable
   free pages in the physical memory allocator's free page lists.

In the coming months, I expect that this new mechanism will be applied in
other places.  For example, balloon drivers should use relocation to
minimize fragmentation of the guest physical address space.

Make vm_phys_alloc_contig() a little smarter (and more efficient in some
cases).  Specifically, use vm_phys_segs[] earlier to avoid scanning free
page lists that can't possibly contain suitable pages.

Reviewed by:	kib, markj
Glanced at:	jhb
Discussed with:	jeff
Sponsored by:	EMC / Isilon Storage Division
Differential Revision:	https://reviews.freebsd.org/D4444
parent 2906f6cbae
commit c869e67208
Notes (svn2git, 2020-12-20 02:59:44 +00:00): svn path=/head/; revision=292469
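Each of the call sites changed below follows the same new pattern: when vm_page_alloc_contig() fails, the caller now asks vm_page_reclaim_contig() to relocate in-use pages out of the constrained physical range, and it only sleeps in VM_WAIT when reclamation also comes up empty. The sketch below is a simplified illustration of that pattern, modeled on the kmem_alloc_contig() and ttm changes in this diff; alloc_contig_retry() is an illustrative name and the flag handling is condensed, so treat it as a sketch rather than code from the tree.

/* Illustrative sketch only -- not part of this commit. */
#include <sys/param.h>
#include <sys/malloc.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>

static vm_page_t
alloc_contig_retry(int pflags, u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary, int mflags)
{
    vm_page_t m;
    int tries;

    for (tries = 0; ; tries++) {
        m = vm_page_alloc_contig(NULL, 0, pflags | VM_ALLOC_NOOBJ,
            npages, low, high, alignment, boundary, VM_MEMATTR_DEFAULT);
        if (m != NULL || tries >= ((mflags & M_NOWAIT) != 0 ? 1 : 3))
            return (m);
        /*
         * Relocate allocated pages out of [low, high) instead of
         * laundering them the way the removed vm_pageout_grow_cache()
         * did.  Sleep only if nothing could be reclaimed and the
         * caller is allowed to wait.
         */
        if (!vm_page_reclaim_contig(pflags, npages, low, high,
            alignment, boundary) && (mflags & M_WAITOK) != 0)
            VM_WAIT;
    }
}

The individual call sites differ only in their retry budgets and in whether they may sleep; compared with the old vm_pageout_grow_cache() path, this avoids converting pages to PG_CACHED and avoids the burst of I/O that laundering required.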
@@ -1488,21 +1488,21 @@ int ttm_bo_global_init(struct drm_global_reference *ref)
     struct ttm_bo_global_ref *bo_ref =
         container_of(ref, struct ttm_bo_global_ref, ref);
     struct ttm_bo_global *glob = ref->object;
-    int ret;
+    int req, ret;
     int tries;
 
     sx_init(&glob->device_list_mutex, "ttmdlm");
     mtx_init(&glob->lru_lock, "ttmlru", NULL, MTX_DEF);
     glob->mem_glob = bo_ref->mem_glob;
+    req = VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ;
     tries = 0;
 retry:
-    glob->dummy_read_page = vm_page_alloc_contig(NULL, 0,
-        VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ,
+    glob->dummy_read_page = vm_page_alloc_contig(NULL, 0, req,
         1, 0, VM_MAX_ADDRESS, PAGE_SIZE, 0, VM_MEMATTR_UNCACHEABLE);
 
     if (unlikely(glob->dummy_read_page == NULL)) {
-        if (tries < 1) {
-            vm_pageout_grow_cache(tries, 0, VM_MAX_ADDRESS);
+        if (tries < 1 && vm_page_reclaim_contig(req, 1,
+            0, VM_MAX_ADDRESS, PAGE_SIZE, 0)) {
             tries++;
             goto retry;
         }
@@ -166,13 +166,9 @@ ttm_vm_page_alloc_dma32(int req, vm_memattr_t memattr)
             PAGE_SIZE, 0, memattr);
         if (p != NULL || tries > 2)
             return (p);
-
-        /*
-         * Before growing the cache see if this is just a normal
-         * memory shortage.
-         */
-        VM_WAIT;
-        vm_pageout_grow_cache(tries, 0, 0xffffffff);
+        if (!vm_page_reclaim_contig(req, 1, 0, 0xffffffff,
+            PAGE_SIZE, 0))
+            VM_WAIT;
     }
 }
 
@@ -178,7 +178,6 @@ void *pmap_kenter_temporary(vm_paddr_t pa, int i);
 void pmap_kenter_temporary_free(vm_paddr_t pa);
 void pmap_flush_pvcache(vm_page_t m);
 int pmap_emulate_modified(pmap_t pmap, vm_offset_t va);
-void pmap_grow_direct_page_cache(void);
 void pmap_page_set_memattr(vm_page_t, vm_memattr_t);
 
 #endif /* _KERNEL */
@ -166,6 +166,7 @@ static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
|
||||
static vm_page_t pmap_alloc_direct_page(unsigned int index, int req);
|
||||
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
|
||||
vm_page_t m, vm_prot_t prot, vm_page_t mpte);
|
||||
static void pmap_grow_direct_page(int req);
|
||||
static int pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va,
|
||||
pd_entry_t pde);
|
||||
static void pmap_remove_page(struct pmap *pmap, vm_offset_t va);
|
||||
@ -1040,14 +1041,16 @@ pmap_pinit0(pmap_t pmap)
|
||||
bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
|
||||
}
|
||||
|
||||
void
|
||||
pmap_grow_direct_page_cache()
|
||||
static void
|
||||
pmap_grow_direct_page(int req)
|
||||
{
|
||||
|
||||
#ifdef __mips_n64
|
||||
VM_WAIT;
|
||||
#else
|
||||
vm_pageout_grow_cache(3, 0, MIPS_KSEG0_LARGEST_PHYS);
|
||||
if (!vm_page_reclaim_contig(req, 1, 0, MIPS_KSEG0_LARGEST_PHYS,
|
||||
PAGE_SIZE, 0))
|
||||
VM_WAIT;
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -1077,13 +1080,15 @@ pmap_pinit(pmap_t pmap)
|
||||
{
|
||||
vm_offset_t ptdva;
|
||||
vm_page_t ptdpg;
|
||||
int i;
|
||||
int i, req_class;
|
||||
|
||||
/*
|
||||
* allocate the page directory page
|
||||
*/
|
||||
while ((ptdpg = pmap_alloc_direct_page(NUSERPGTBLS, VM_ALLOC_NORMAL)) == NULL)
|
||||
pmap_grow_direct_page_cache();
|
||||
req_class = VM_ALLOC_NORMAL;
|
||||
while ((ptdpg = pmap_alloc_direct_page(NUSERPGTBLS, req_class)) ==
|
||||
NULL)
|
||||
pmap_grow_direct_page(req_class);
|
||||
|
||||
ptdva = MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(ptdpg));
|
||||
pmap->pm_segtab = (pd_entry_t *)ptdva;
|
||||
@ -1107,15 +1112,17 @@ _pmap_allocpte(pmap_t pmap, unsigned ptepindex, u_int flags)
|
||||
{
|
||||
vm_offset_t pageva;
|
||||
vm_page_t m;
|
||||
int req_class;
|
||||
|
||||
/*
|
||||
* Find or fabricate a new pagetable page
|
||||
*/
|
||||
if ((m = pmap_alloc_direct_page(ptepindex, VM_ALLOC_NORMAL)) == NULL) {
|
||||
req_class = VM_ALLOC_NORMAL;
|
||||
if ((m = pmap_alloc_direct_page(ptepindex, req_class)) == NULL) {
|
||||
if ((flags & PMAP_ENTER_NOSLEEP) == 0) {
|
||||
PMAP_UNLOCK(pmap);
|
||||
rw_wunlock(&pvh_global_lock);
|
||||
pmap_grow_direct_page_cache();
|
||||
pmap_grow_direct_page(req_class);
|
||||
rw_wlock(&pvh_global_lock);
|
||||
PMAP_LOCK(pmap);
|
||||
}
|
||||
@ -1241,9 +1248,10 @@ pmap_growkernel(vm_offset_t addr)
|
||||
vm_page_t nkpg;
|
||||
pd_entry_t *pde, *pdpe;
|
||||
pt_entry_t *pte;
|
||||
int i;
|
||||
int i, req_class;
|
||||
|
||||
mtx_assert(&kernel_map->system_mtx, MA_OWNED);
|
||||
req_class = VM_ALLOC_INTERRUPT;
|
||||
addr = roundup2(addr, NBSEG);
|
||||
if (addr - 1 >= kernel_map->max_offset)
|
||||
addr = kernel_map->max_offset;
|
||||
@ -1252,7 +1260,7 @@ pmap_growkernel(vm_offset_t addr)
|
||||
#ifdef __mips_n64
|
||||
if (*pdpe == 0) {
|
||||
/* new intermediate page table entry */
|
||||
nkpg = pmap_alloc_direct_page(nkpt, VM_ALLOC_INTERRUPT);
|
||||
nkpg = pmap_alloc_direct_page(nkpt, req_class);
|
||||
if (nkpg == NULL)
|
||||
panic("pmap_growkernel: no memory to grow kernel");
|
||||
*pdpe = (pd_entry_t)MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(nkpg));
|
||||
@ -1272,8 +1280,13 @@ pmap_growkernel(vm_offset_t addr)
|
||||
/*
|
||||
* This index is bogus, but out of the way
|
||||
*/
|
||||
nkpg = pmap_alloc_direct_page(nkpt, VM_ALLOC_INTERRUPT);
|
||||
if (!nkpg)
|
||||
nkpg = pmap_alloc_direct_page(nkpt, req_class);
|
||||
#ifndef __mips_n64
|
||||
if (nkpg == NULL && vm_page_reclaim_contig(req_class, 1,
|
||||
0, MIPS_KSEG0_LARGEST_PHYS, PAGE_SIZE, 0))
|
||||
nkpg = pmap_alloc_direct_page(nkpt, req_class);
|
||||
#endif
|
||||
if (nkpg == NULL)
|
||||
panic("pmap_growkernel: no memory to grow kernel");
|
||||
nkpt++;
|
||||
*pde = (pd_entry_t)MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(nkpg));
|
||||
|
@@ -53,11 +53,16 @@ uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
 
     for (;;) {
         m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, pflags);
+#ifndef __mips_n64
+        if (m == NULL && vm_page_reclaim_contig(pflags, 1,
+            0, MIPS_KSEG0_LARGEST_PHYS, PAGE_SIZE, 0))
+            continue;
+#endif
         if (m == NULL) {
             if (wait & M_NOWAIT)
                 return (NULL);
             else
-                pmap_grow_direct_page_cache();
+                VM_WAIT;
         } else
             break;
     }
@@ -181,7 +181,10 @@ kmem_alloc_attr(vmem_t *vmem, vm_size_t size, int flags, vm_paddr_t low,
         if (m == NULL) {
             VM_OBJECT_WUNLOCK(object);
             if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
-                vm_pageout_grow_cache(tries, low, high);
+                if (!vm_page_reclaim_contig(pflags, 1,
+                    low, high, PAGE_SIZE, 0) &&
+                    (flags & M_WAITOK) != 0)
+                    VM_WAIT;
                 VM_OBJECT_WLOCK(object);
                 tries++;
                 goto retry;
@ -217,6 +220,7 @@ kmem_alloc_contig(struct vmem *vmem, vm_size_t size, int flags, vm_paddr_t low,
|
||||
vm_offset_t addr, tmp;
|
||||
vm_ooffset_t offset;
|
||||
vm_page_t end_m, m;
|
||||
u_long npages;
|
||||
int pflags, tries;
|
||||
|
||||
size = round_page(size);
|
||||
@ -224,15 +228,18 @@ kmem_alloc_contig(struct vmem *vmem, vm_size_t size, int flags, vm_paddr_t low,
|
||||
return (0);
|
||||
offset = addr - VM_MIN_KERNEL_ADDRESS;
|
||||
pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
|
||||
npages = atop(size);
|
||||
VM_OBJECT_WLOCK(object);
|
||||
tries = 0;
|
||||
retry:
|
||||
m = vm_page_alloc_contig(object, OFF_TO_IDX(offset), pflags,
|
||||
atop(size), low, high, alignment, boundary, memattr);
|
||||
npages, low, high, alignment, boundary, memattr);
|
||||
if (m == NULL) {
|
||||
VM_OBJECT_WUNLOCK(object);
|
||||
if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
|
||||
vm_pageout_grow_cache(tries, low, high);
|
||||
if (!vm_page_reclaim_contig(pflags, npages, low, high,
|
||||
alignment, boundary) && (flags & M_WAITOK) != 0)
|
||||
VM_WAIT;
|
||||
VM_OBJECT_WLOCK(object);
|
||||
tries++;
|
||||
goto retry;
|
||||
@ -240,7 +247,7 @@ kmem_alloc_contig(struct vmem *vmem, vm_size_t size, int flags, vm_paddr_t low,
|
||||
vmem_free(vmem, addr, size);
|
||||
return (0);
|
||||
}
|
||||
end_m = m + atop(size);
|
||||
end_m = m + npages;
|
||||
tmp = addr;
|
||||
for (; m < end_m; m++) {
|
||||
if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0)
|
||||
|
589
sys/vm/vm_page.c
@ -158,11 +158,14 @@ static struct vnode *vm_page_alloc_init(vm_page_t m);
|
||||
static void vm_page_cache_turn_free(vm_page_t m);
|
||||
static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
|
||||
static void vm_page_enqueue(uint8_t queue, vm_page_t m);
|
||||
static void vm_page_free_wakeup(void);
|
||||
static void vm_page_init_fakepg(void *dummy);
|
||||
static int vm_page_insert_after(vm_page_t m, vm_object_t object,
|
||||
vm_pindex_t pindex, vm_page_t mpred);
|
||||
static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object,
|
||||
vm_page_t mpred);
|
||||
static int vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run,
|
||||
vm_paddr_t high);
|
||||
|
||||
SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL);
|
||||
|
||||
@ -2093,6 +2096,592 @@ vm_page_alloc_freelist(int flind, int req)
|
||||
return (m);
|
||||
}
|
||||
|
||||
#define VPSC_ANY 0 /* No restrictions. */
|
||||
#define VPSC_NORESERV 1 /* Skip reservations; implies VPSC_NOSUPER. */
|
||||
#define VPSC_NOSUPER 2 /* Skip superpages. */
|
||||
|
||||
/*
|
||||
* vm_page_scan_contig:
|
||||
*
|
||||
* Scan vm_page_array[] between the specified entries "m_start" and
|
||||
* "m_end" for a run of contiguous physical pages that satisfy the
|
||||
* specified conditions, and return the lowest page in the run. The
|
||||
* specified "alignment" determines the alignment of the lowest physical
|
||||
* page in the run. If the specified "boundary" is non-zero, then the
|
||||
* run of physical pages cannot span a physical address that is a
|
||||
* multiple of "boundary".
|
||||
*
|
||||
* "m_end" is never dereferenced, so it need not point to a vm_page
|
||||
* structure within vm_page_array[].
|
||||
*
|
||||
* "npages" must be greater than zero. "m_start" and "m_end" must not
|
||||
* span a hole (or discontiguity) in the physical address space. Both
|
||||
* "alignment" and "boundary" must be a power of two.
|
||||
*/
|
||||
vm_page_t
|
||||
vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end,
|
||||
u_long alignment, vm_paddr_t boundary, int options)
|
||||
{
|
||||
struct mtx *m_mtx, *new_mtx;
|
||||
vm_object_t object;
|
||||
vm_paddr_t pa;
|
||||
vm_page_t m, m_run;
|
||||
#if VM_NRESERVLEVEL > 0
|
||||
int level;
|
||||
#endif
|
||||
int m_inc, order, run_ext, run_len;
|
||||
|
||||
KASSERT(npages > 0, ("npages is 0"));
|
||||
KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
|
||||
KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
|
||||
m_run = NULL;
|
||||
run_len = 0;
|
||||
m_mtx = NULL;
|
||||
for (m = m_start; m < m_end && run_len < npages; m += m_inc) {
|
||||
KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0,
|
||||
("page %p is PG_FICTITIOUS or PG_MARKER", m));
|
||||
|
||||
/*
|
||||
* If the current page would be the start of a run, check its
|
||||
* physical address against the end, alignment, and boundary
|
||||
* conditions. If it doesn't satisfy these conditions, either
|
||||
* terminate the scan or advance to the next page that
|
||||
* satisfies the failed condition.
|
||||
*/
|
||||
if (run_len == 0) {
|
||||
KASSERT(m_run == NULL, ("m_run != NULL"));
|
||||
if (m + npages > m_end)
|
||||
break;
|
||||
pa = VM_PAGE_TO_PHYS(m);
|
||||
if ((pa & (alignment - 1)) != 0) {
|
||||
m_inc = atop(roundup2(pa, alignment) - pa);
|
||||
continue;
|
||||
}
|
||||
if (((pa ^ (pa + ptoa(npages) - 1)) & ~(boundary -
|
||||
1)) != 0) {
|
||||
m_inc = atop(roundup2(pa, boundary) - pa);
|
||||
continue;
|
||||
}
|
||||
} else
|
||||
KASSERT(m_run != NULL, ("m_run == NULL"));
|
||||
|
||||
/*
|
||||
* Avoid releasing and reacquiring the same page lock.
|
||||
*/
|
||||
new_mtx = vm_page_lockptr(m);
|
||||
if (m_mtx != new_mtx) {
|
||||
if (m_mtx != NULL)
|
||||
mtx_unlock(m_mtx);
|
||||
m_mtx = new_mtx;
|
||||
mtx_lock(m_mtx);
|
||||
}
|
||||
m_inc = 1;
|
||||
retry:
|
||||
if (m->wire_count != 0 || m->hold_count != 0)
|
||||
run_ext = 0;
|
||||
#if VM_NRESERVLEVEL > 0
|
||||
else if ((level = vm_reserv_level(m)) >= 0 &&
|
||||
(options & VPSC_NORESERV) != 0) {
|
||||
run_ext = 0;
|
||||
/* Advance to the end of the reservation. */
|
||||
pa = VM_PAGE_TO_PHYS(m);
|
||||
m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) -
|
||||
pa);
|
||||
}
|
||||
#endif
|
||||
else if ((object = m->object) != NULL) {
|
||||
/*
|
||||
* The page is considered eligible for relocation if
|
||||
* and only if it could be laundered or reclaimed by
|
||||
* the page daemon.
|
||||
*/
|
||||
if (!VM_OBJECT_TRYRLOCK(object)) {
|
||||
mtx_unlock(m_mtx);
|
||||
VM_OBJECT_RLOCK(object);
|
||||
mtx_lock(m_mtx);
|
||||
if (m->object != object) {
|
||||
/*
|
||||
* The page may have been freed.
|
||||
*/
|
||||
VM_OBJECT_RUNLOCK(object);
|
||||
goto retry;
|
||||
} else if (m->wire_count != 0 ||
|
||||
m->hold_count != 0) {
|
||||
run_ext = 0;
|
||||
goto unlock;
|
||||
}
|
||||
}
|
||||
KASSERT((m->flags & PG_UNHOLDFREE) == 0,
|
||||
("page %p is PG_UNHOLDFREE", m));
|
||||
/* Don't care: PG_NODUMP, PG_WINATCFLS, PG_ZERO. */
|
||||
if (object->type != OBJT_DEFAULT &&
|
||||
object->type != OBJT_SWAP &&
|
||||
object->type != OBJT_VNODE)
|
||||
run_ext = 0;
|
||||
else if ((m->flags & PG_CACHED) != 0 ||
|
||||
m != vm_page_lookup(object, m->pindex)) {
|
||||
/*
|
||||
* The page is cached or recently converted
|
||||
* from cached to free.
|
||||
*/
|
||||
#if VM_NRESERVLEVEL > 0
|
||||
if (level >= 0) {
|
||||
/*
|
||||
* The page is reserved. Extend the
|
||||
* current run by one page.
|
||||
*/
|
||||
run_ext = 1;
|
||||
} else
|
||||
#endif
|
||||
if ((order = m->order) < VM_NFREEORDER) {
|
||||
/*
|
||||
* The page is enqueued in the
|
||||
* physical memory allocator's cache/
|
||||
* free page queues. Moreover, it is
|
||||
* the first page in a power-of-two-
|
||||
* sized run of contiguous cache/free
|
||||
* pages. Add these pages to the end
|
||||
* of the current run, and jump
|
||||
* ahead.
|
||||
*/
|
||||
run_ext = 1 << order;
|
||||
m_inc = 1 << order;
|
||||
} else
|
||||
run_ext = 0;
|
||||
#if VM_NRESERVLEVEL > 0
|
||||
} else if ((options & VPSC_NOSUPER) != 0 &&
|
||||
(level = vm_reserv_level_iffullpop(m)) >= 0) {
|
||||
run_ext = 0;
|
||||
/* Advance to the end of the superpage. */
|
||||
pa = VM_PAGE_TO_PHYS(m);
|
||||
m_inc = atop(roundup2(pa + 1,
|
||||
vm_reserv_size(level)) - pa);
|
||||
#endif
|
||||
} else if (object->memattr == VM_MEMATTR_DEFAULT &&
|
||||
m->queue != PQ_NONE && !vm_page_busied(m)) {
|
||||
/*
|
||||
* The page is allocated but eligible for
|
||||
* relocation. Extend the current run by one
|
||||
* page.
|
||||
*/
|
||||
KASSERT(pmap_page_get_memattr(m) ==
|
||||
VM_MEMATTR_DEFAULT,
|
||||
("page %p has an unexpected memattr", m));
|
||||
KASSERT((m->oflags & (VPO_SWAPINPROG |
|
||||
VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0,
|
||||
("page %p has unexpected oflags", m));
|
||||
/* Don't care: VPO_NOSYNC. */
|
||||
run_ext = 1;
|
||||
} else
|
||||
run_ext = 0;
|
||||
unlock:
|
||||
VM_OBJECT_RUNLOCK(object);
|
||||
#if VM_NRESERVLEVEL > 0
|
||||
} else if (level >= 0) {
|
||||
/*
|
||||
* The page is reserved but not yet allocated. In
|
||||
* other words, it is still cached or free. Extend
|
||||
* the current run by one page.
|
||||
*/
|
||||
run_ext = 1;
|
||||
#endif
|
||||
} else if ((order = m->order) < VM_NFREEORDER) {
|
||||
/*
|
||||
* The page is enqueued in the physical memory
|
||||
* allocator's cache/free page queues. Moreover, it
|
||||
* is the first page in a power-of-two-sized run of
|
||||
* contiguous cache/free pages. Add these pages to
|
||||
* the end of the current run, and jump ahead.
|
||||
*/
|
||||
run_ext = 1 << order;
|
||||
m_inc = 1 << order;
|
||||
} else {
|
||||
/*
|
||||
* Skip the page for one of the following reasons: (1)
|
||||
* It is enqueued in the physical memory allocator's
|
||||
* cache/free page queues. However, it is not the
|
||||
* first page in a run of contiguous cache/free pages.
|
||||
* (This case rarely occurs because the scan is
|
||||
* performed in ascending order.) (2) It is not
|
||||
* reserved, and it is transitioning from free to
|
||||
* allocated. (Conversely, the transition from
|
||||
* allocated to free for managed pages is blocked by
|
||||
* the page lock.) (3) It is allocated but not
|
||||
* contained by an object and not wired, e.g.,
|
||||
* allocated by Xen's balloon driver.
|
||||
*/
|
||||
run_ext = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Extend or reset the current run of pages.
|
||||
*/
|
||||
if (run_ext > 0) {
|
||||
if (run_len == 0)
|
||||
m_run = m;
|
||||
run_len += run_ext;
|
||||
} else {
|
||||
if (run_len > 0) {
|
||||
m_run = NULL;
|
||||
run_len = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (m_mtx != NULL)
|
||||
mtx_unlock(m_mtx);
|
||||
if (run_len >= npages)
|
||||
return (m_run);
|
||||
return (NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* vm_page_reclaim_run:
|
||||
*
|
||||
* Try to relocate each of the allocated virtual pages within the
|
||||
* specified run of physical pages to a new physical address. Free the
|
||||
* physical pages underlying the relocated virtual pages. A virtual page
|
||||
* is relocatable if and only if it could be laundered or reclaimed by
|
||||
* the page daemon. Whenever possible, a virtual page is relocated to a
|
||||
* physical address above "high".
|
||||
*
|
||||
* Returns 0 if every physical page within the run was already free or
|
||||
* just freed by a successful relocation. Otherwise, returns a non-zero
|
||||
* value indicating why the last attempt to relocate a virtual page was
|
||||
* unsuccessful.
|
||||
*
|
||||
* "req_class" must be an allocation class.
|
||||
*/
|
||||
static int
|
||||
vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run,
|
||||
vm_paddr_t high)
|
||||
{
|
||||
struct mtx *m_mtx, *new_mtx;
|
||||
struct spglist free;
|
||||
vm_object_t object;
|
||||
vm_paddr_t pa;
|
||||
vm_page_t m, m_end, m_new;
|
||||
int error, order, req;
|
||||
|
||||
KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class,
|
||||
("req_class is not an allocation class"));
|
||||
SLIST_INIT(&free);
|
||||
error = 0;
|
||||
m = m_run;
|
||||
m_end = m_run + npages;
|
||||
m_mtx = NULL;
|
||||
for (; error == 0 && m < m_end; m++) {
|
||||
KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0,
|
||||
("page %p is PG_FICTITIOUS or PG_MARKER", m));
|
||||
|
||||
/*
|
||||
* Avoid releasing and reacquiring the same page lock.
|
||||
*/
|
||||
new_mtx = vm_page_lockptr(m);
|
||||
if (m_mtx != new_mtx) {
|
||||
if (m_mtx != NULL)
|
||||
mtx_unlock(m_mtx);
|
||||
m_mtx = new_mtx;
|
||||
mtx_lock(m_mtx);
|
||||
}
|
||||
retry:
|
||||
if (m->wire_count != 0 || m->hold_count != 0)
|
||||
error = EBUSY;
|
||||
else if ((object = m->object) != NULL) {
|
||||
/*
|
||||
* The page is relocated if and only if it could be
|
||||
* laundered or reclaimed by the page daemon.
|
||||
*/
|
||||
if (!VM_OBJECT_TRYWLOCK(object)) {
|
||||
mtx_unlock(m_mtx);
|
||||
VM_OBJECT_WLOCK(object);
|
||||
mtx_lock(m_mtx);
|
||||
if (m->object != object) {
|
||||
/*
|
||||
* The page may have been freed.
|
||||
*/
|
||||
VM_OBJECT_WUNLOCK(object);
|
||||
goto retry;
|
||||
} else if (m->wire_count != 0 ||
|
||||
m->hold_count != 0) {
|
||||
error = EBUSY;
|
||||
goto unlock;
|
||||
}
|
||||
}
|
||||
KASSERT((m->flags & PG_UNHOLDFREE) == 0,
|
||||
("page %p is PG_UNHOLDFREE", m));
|
||||
/* Don't care: PG_NODUMP, PG_WINATCFLS, PG_ZERO. */
|
||||
if (object->type != OBJT_DEFAULT &&
|
||||
object->type != OBJT_SWAP &&
|
||||
object->type != OBJT_VNODE)
|
||||
error = EINVAL;
|
||||
else if ((m->flags & PG_CACHED) != 0 ||
|
||||
m != vm_page_lookup(object, m->pindex)) {
|
||||
/*
|
||||
* The page is cached or recently converted
|
||||
* from cached to free.
|
||||
*/
|
||||
VM_OBJECT_WUNLOCK(object);
|
||||
goto cached;
|
||||
} else if (object->memattr != VM_MEMATTR_DEFAULT)
|
||||
error = EINVAL;
|
||||
else if (m->queue != PQ_NONE && !vm_page_busied(m)) {
|
||||
KASSERT(pmap_page_get_memattr(m) ==
|
||||
VM_MEMATTR_DEFAULT,
|
||||
("page %p has an unexpected memattr", m));
|
||||
KASSERT((m->oflags & (VPO_SWAPINPROG |
|
||||
VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0,
|
||||
("page %p has unexpected oflags", m));
|
||||
/* Don't care: VPO_NOSYNC. */
|
||||
if (m->valid != 0) {
|
||||
/*
|
||||
* First, try to allocate a new page
|
||||
* that is above "high". Failing
|
||||
* that, try to allocate a new page
|
||||
* that is below "m_run". Allocate
|
||||
* the new page between the end of
|
||||
* "m_run" and "high" only as a last
|
||||
* resort.
|
||||
*/
|
||||
req = req_class | VM_ALLOC_NOOBJ;
|
||||
if ((m->flags & PG_NODUMP) != 0)
|
||||
req |= VM_ALLOC_NODUMP;
|
||||
if (trunc_page(high) !=
|
||||
~(vm_paddr_t)PAGE_MASK) {
|
||||
m_new = vm_page_alloc_contig(
|
||||
NULL, 0, req, 1,
|
||||
round_page(high),
|
||||
~(vm_paddr_t)0,
|
||||
PAGE_SIZE, 0,
|
||||
VM_MEMATTR_DEFAULT);
|
||||
} else
|
||||
m_new = NULL;
|
||||
if (m_new == NULL) {
|
||||
pa = VM_PAGE_TO_PHYS(m_run);
|
||||
m_new = vm_page_alloc_contig(
|
||||
NULL, 0, req, 1,
|
||||
0, pa - 1, PAGE_SIZE, 0,
|
||||
VM_MEMATTR_DEFAULT);
|
||||
}
|
||||
if (m_new == NULL) {
|
||||
pa += ptoa(npages);
|
||||
m_new = vm_page_alloc_contig(
|
||||
NULL, 0, req, 1,
|
||||
pa, high, PAGE_SIZE, 0,
|
||||
VM_MEMATTR_DEFAULT);
|
||||
}
|
||||
if (m_new == NULL) {
|
||||
error = ENOMEM;
|
||||
goto unlock;
|
||||
}
|
||||
KASSERT(m_new->wire_count == 0,
|
||||
("page %p is wired", m));
|
||||
|
||||
/*
|
||||
* Replace "m" with the new page. For
|
||||
* vm_page_replace(), "m" must be busy
|
||||
* and dequeued. Finally, change "m"
|
||||
* as if vm_page_free() was called.
|
||||
*/
|
||||
if (object->ref_count != 0)
|
||||
pmap_remove_all(m);
|
||||
m_new->aflags = m->aflags;
|
||||
KASSERT(m_new->oflags == VPO_UNMANAGED,
|
||||
("page %p is managed", m));
|
||||
m_new->oflags = m->oflags & VPO_NOSYNC;
|
||||
pmap_copy_page(m, m_new);
|
||||
m_new->valid = m->valid;
|
||||
m_new->dirty = m->dirty;
|
||||
m->flags &= ~PG_ZERO;
|
||||
vm_page_xbusy(m);
|
||||
vm_page_remque(m);
|
||||
vm_page_replace_checked(m_new, object,
|
||||
m->pindex, m);
|
||||
m->valid = 0;
|
||||
vm_page_undirty(m);
|
||||
|
||||
/*
|
||||
* The new page must be deactivated
|
||||
* before the object is unlocked.
|
||||
*/
|
||||
new_mtx = vm_page_lockptr(m_new);
|
||||
if (m_mtx != new_mtx) {
|
||||
mtx_unlock(m_mtx);
|
||||
m_mtx = new_mtx;
|
||||
mtx_lock(m_mtx);
|
||||
}
|
||||
vm_page_deactivate(m_new);
|
||||
} else {
|
||||
m->flags &= ~PG_ZERO;
|
||||
vm_page_remque(m);
|
||||
vm_page_remove(m);
|
||||
KASSERT(m->dirty == 0,
|
||||
("page %p is dirty", m));
|
||||
}
|
||||
SLIST_INSERT_HEAD(&free, m, plinks.s.ss);
|
||||
} else
|
||||
error = EBUSY;
|
||||
unlock:
|
||||
VM_OBJECT_WUNLOCK(object);
|
||||
} else {
|
||||
cached:
|
||||
mtx_lock(&vm_page_queue_free_mtx);
|
||||
order = m->order;
|
||||
if (order < VM_NFREEORDER) {
|
||||
/*
|
||||
* The page is enqueued in the physical memory
|
||||
* allocator's cache/free page queues.
|
||||
* Moreover, it is the first page in a power-
|
||||
* of-two-sized run of contiguous cache/free
|
||||
* pages. Jump ahead to the last page within
|
||||
* that run, and continue from there.
|
||||
*/
|
||||
m += (1 << order) - 1;
|
||||
}
|
||||
#if VM_NRESERVLEVEL > 0
|
||||
else if (vm_reserv_is_page_free(m))
|
||||
order = 0;
|
||||
#endif
|
||||
mtx_unlock(&vm_page_queue_free_mtx);
|
||||
if (order == VM_NFREEORDER)
|
||||
error = EINVAL;
|
||||
}
|
||||
}
|
||||
if (m_mtx != NULL)
|
||||
mtx_unlock(m_mtx);
|
||||
if ((m = SLIST_FIRST(&free)) != NULL) {
|
||||
mtx_lock(&vm_page_queue_free_mtx);
|
||||
do {
|
||||
SLIST_REMOVE_HEAD(&free, plinks.s.ss);
|
||||
vm_phys_freecnt_adj(m, 1);
|
||||
#if VM_NRESERVLEVEL > 0
|
||||
if (!vm_reserv_free_page(m))
|
||||
#else
|
||||
if (true)
|
||||
#endif
|
||||
vm_phys_free_pages(m, 0);
|
||||
} while ((m = SLIST_FIRST(&free)) != NULL);
|
||||
vm_page_zero_idle_wakeup();
|
||||
vm_page_free_wakeup();
|
||||
mtx_unlock(&vm_page_queue_free_mtx);
|
||||
}
|
||||
return (error);
|
||||
}
|
||||
|
||||
#define NRUNS 16
|
||||
|
||||
CTASSERT(powerof2(NRUNS));
|
||||
|
||||
#define RUN_INDEX(count) ((count) & (NRUNS - 1))
|
||||
|
||||
#define MIN_RECLAIM 8
|
||||
|
||||
/*
|
||||
* vm_page_reclaim_contig:
|
||||
*
|
||||
* Reclaim allocated, contiguous physical memory satisfying the specified
|
||||
* conditions by relocating the virtual pages using that physical memory.
|
||||
* Returns true if reclamation is successful and false otherwise. Since
|
||||
* relocation requires the allocation of physical pages, reclamation may
|
||||
* fail due to a shortage of cache/free pages. When reclamation fails,
|
||||
* callers are expected to perform VM_WAIT before retrying a failed
|
||||
* allocation operation, e.g., vm_page_alloc_contig().
|
||||
*
|
||||
* The caller must always specify an allocation class through "req".
|
||||
*
|
||||
* allocation classes:
|
||||
* VM_ALLOC_NORMAL normal process request
|
||||
* VM_ALLOC_SYSTEM system *really* needs a page
|
||||
* VM_ALLOC_INTERRUPT interrupt time request
|
||||
*
|
||||
* The optional allocation flags are ignored.
|
||||
*
|
||||
* "npages" must be greater than zero. Both "alignment" and "boundary"
|
||||
* must be a power of two.
|
||||
*/
|
||||
bool
|
||||
vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,
|
||||
u_long alignment, vm_paddr_t boundary)
|
||||
{
|
||||
vm_paddr_t curr_low;
|
||||
vm_page_t m_run, m_runs[NRUNS];
|
||||
u_long count, reclaimed;
|
||||
int error, i, options, req_class;
|
||||
|
||||
KASSERT(npages > 0, ("npages is 0"));
|
||||
KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
|
||||
KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
|
||||
req_class = req & VM_ALLOC_CLASS_MASK;
|
||||
|
||||
/*
|
||||
* The page daemon is allowed to dig deeper into the free page list.
|
||||
*/
|
||||
if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
|
||||
req_class = VM_ALLOC_SYSTEM;
|
||||
|
||||
/*
|
||||
* Return if the number of cached and free pages cannot satisfy the
|
||||
* requested allocation.
|
||||
*/
|
||||
count = vm_cnt.v_free_count + vm_cnt.v_cache_count;
|
||||
if (count < npages + vm_cnt.v_free_reserved || (count < npages +
|
||||
vm_cnt.v_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) ||
|
||||
(count < npages && req_class == VM_ALLOC_INTERRUPT))
|
||||
return (false);
|
||||
|
||||
/*
|
||||
* Scan up to three times, relaxing the restrictions ("options") on
|
||||
* the reclamation of reservations and superpages each time.
|
||||
*/
|
||||
for (options = VPSC_NORESERV;;) {
|
||||
/*
|
||||
* Find the highest runs that satisfy the given constraints
|
||||
* and restrictions, and record them in "m_runs".
|
||||
*/
|
||||
curr_low = low;
|
||||
count = 0;
|
||||
for (;;) {
|
||||
m_run = vm_phys_scan_contig(npages, curr_low, high,
|
||||
alignment, boundary, options);
|
||||
if (m_run == NULL)
|
||||
break;
|
||||
curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages);
|
||||
m_runs[RUN_INDEX(count)] = m_run;
|
||||
count++;
|
||||
}
|
||||
|
||||
/*
|
||||
* Reclaim the highest runs in LIFO (descending) order until
|
||||
* the number of reclaimed pages, "reclaimed", is at least
|
||||
* MIN_RECLAIM. Reset "reclaimed" each time because each
|
||||
* reclamation is idempotent, and runs will (likely) recur
|
||||
* from one scan to the next as restrictions are relaxed.
|
||||
*/
|
||||
reclaimed = 0;
|
||||
for (i = 0; count > 0 && i < NRUNS; i++) {
|
||||
count--;
|
||||
m_run = m_runs[RUN_INDEX(count)];
|
||||
error = vm_page_reclaim_run(req_class, npages, m_run,
|
||||
high);
|
||||
if (error == 0) {
|
||||
reclaimed += npages;
|
||||
if (reclaimed >= MIN_RECLAIM)
|
||||
return (true);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Either relax the restrictions on the next scan or return if
|
||||
* the last scan had no restrictions.
|
||||
*/
|
||||
if (options == VPSC_NORESERV)
|
||||
options = VPSC_NOSUPER;
|
||||
else if (options == VPSC_NOSUPER)
|
||||
options = VPSC_ANY;
|
||||
else if (options == VPSC_ANY)
|
||||
return (reclaimed != 0);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* vm_wait: (also see VM_WAIT macro)
|
||||
*
|
||||
|
@@ -474,6 +474,8 @@ vm_page_t vm_page_prev(vm_page_t m);
 boolean_t vm_page_ps_is_valid(vm_page_t m);
 void vm_page_putfake(vm_page_t m);
 void vm_page_readahead_finish(vm_page_t m);
+bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low,
+    vm_paddr_t high, u_long alignment, vm_paddr_t boundary);
 void vm_page_reference(vm_page_t m);
 void vm_page_remove (vm_page_t);
 int vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t);
@@ -482,6 +484,8 @@ vm_page_t vm_page_replace(vm_page_t mnew, vm_object_t object,
 void vm_page_requeue(vm_page_t m);
 void vm_page_requeue_locked(vm_page_t m);
 int vm_page_sbusied(vm_page_t m);
+vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start,
+    vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options);
 void vm_page_set_valid_range(vm_page_t m, int base, int size);
 int vm_page_sleep_if_busy(vm_page_t m, const char *msg);
 vm_offset_t vm_page_startup(vm_offset_t vaddr);
@ -237,8 +237,6 @@ SYSCTL_INT(_vm, OID_AUTO, max_wired,
|
||||
CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
|
||||
|
||||
static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
|
||||
static boolean_t vm_pageout_launder(struct vm_pagequeue *pq, int, vm_paddr_t,
|
||||
vm_paddr_t);
|
||||
#if !defined(NO_SWAPPING)
|
||||
static void vm_pageout_map_deactivate_pages(vm_map_t, long);
|
||||
static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
|
||||
@ -595,170 +593,6 @@ vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
|
||||
return (numpagedout);
|
||||
}
|
||||
|
||||
static boolean_t
|
||||
vm_pageout_launder(struct vm_pagequeue *pq, int tries, vm_paddr_t low,
|
||||
vm_paddr_t high)
|
||||
{
|
||||
struct mount *mp;
|
||||
struct vnode *vp;
|
||||
vm_object_t object;
|
||||
vm_paddr_t pa;
|
||||
vm_page_t m, m_tmp, next;
|
||||
int lockmode;
|
||||
|
||||
vm_pagequeue_lock(pq);
|
||||
TAILQ_FOREACH_SAFE(m, &pq->pq_pl, plinks.q, next) {
|
||||
if ((m->flags & PG_MARKER) != 0)
|
||||
continue;
|
||||
pa = VM_PAGE_TO_PHYS(m);
|
||||
if (pa < low || pa + PAGE_SIZE > high)
|
||||
continue;
|
||||
if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) {
|
||||
vm_page_unlock(m);
|
||||
continue;
|
||||
}
|
||||
object = m->object;
|
||||
if ((!VM_OBJECT_TRYWLOCK(object) &&
|
||||
(!vm_pageout_fallback_object_lock(m, &next) ||
|
||||
m->hold_count != 0)) || vm_page_busied(m)) {
|
||||
vm_page_unlock(m);
|
||||
VM_OBJECT_WUNLOCK(object);
|
||||
continue;
|
||||
}
|
||||
vm_page_test_dirty(m);
|
||||
if (m->dirty == 0 && object->ref_count != 0)
|
||||
pmap_remove_all(m);
|
||||
if (m->dirty != 0) {
|
||||
vm_page_unlock(m);
|
||||
if (tries == 0 || (object->flags & OBJ_DEAD) != 0) {
|
||||
VM_OBJECT_WUNLOCK(object);
|
||||
continue;
|
||||
}
|
||||
if (object->type == OBJT_VNODE) {
|
||||
vm_pagequeue_unlock(pq);
|
||||
vp = object->handle;
|
||||
vm_object_reference_locked(object);
|
||||
VM_OBJECT_WUNLOCK(object);
|
||||
(void)vn_start_write(vp, &mp, V_WAIT);
|
||||
lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
|
||||
LK_SHARED : LK_EXCLUSIVE;
|
||||
vn_lock(vp, lockmode | LK_RETRY);
|
||||
VM_OBJECT_WLOCK(object);
|
||||
vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
|
||||
VM_OBJECT_WUNLOCK(object);
|
||||
VOP_UNLOCK(vp, 0);
|
||||
vm_object_deallocate(object);
|
||||
vn_finished_write(mp);
|
||||
return (TRUE);
|
||||
} else if (object->type == OBJT_SWAP ||
|
||||
object->type == OBJT_DEFAULT) {
|
||||
vm_pagequeue_unlock(pq);
|
||||
m_tmp = m;
|
||||
vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC,
|
||||
0, NULL, NULL);
|
||||
VM_OBJECT_WUNLOCK(object);
|
||||
return (TRUE);
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* Dequeue here to prevent lock recursion in
|
||||
* vm_page_cache().
|
||||
*/
|
||||
vm_page_dequeue_locked(m);
|
||||
vm_page_cache(m);
|
||||
vm_page_unlock(m);
|
||||
}
|
||||
VM_OBJECT_WUNLOCK(object);
|
||||
}
|
||||
vm_pagequeue_unlock(pq);
|
||||
return (FALSE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Increase the number of cached pages. The specified value, "tries",
|
||||
* determines which categories of pages are cached:
|
||||
*
|
||||
* 0: All clean, inactive pages within the specified physical address range
|
||||
* are cached. Will not sleep.
|
||||
* 1: The vm_lowmem handlers are called. All inactive pages within
|
||||
* the specified physical address range are cached. May sleep.
|
||||
* 2: The vm_lowmem handlers are called. All inactive and active pages
|
||||
* within the specified physical address range are cached. May sleep.
|
||||
*/
|
||||
void
|
||||
vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high)
|
||||
{
|
||||
int actl, actmax, inactl, inactmax, dom, initial_dom;
|
||||
static int start_dom = 0;
|
||||
|
||||
if (tries > 0) {
|
||||
/*
|
||||
* Decrease registered cache sizes. The vm_lowmem handlers
|
||||
* may acquire locks and/or sleep, so they can only be invoked
|
||||
* when "tries" is greater than zero.
|
||||
*/
|
||||
SDT_PROBE0(vm, , , vm__lowmem_cache);
|
||||
EVENTHANDLER_INVOKE(vm_lowmem, 0);
|
||||
|
||||
/*
|
||||
* We do this explicitly after the caches have been drained
|
||||
* above.
|
||||
*/
|
||||
uma_reclaim();
|
||||
}
|
||||
|
||||
/*
|
||||
* Make the next scan start on the next domain.
|
||||
*/
|
||||
initial_dom = atomic_fetchadd_int(&start_dom, 1) % vm_ndomains;
|
||||
|
||||
inactl = 0;
|
||||
inactmax = vm_cnt.v_inactive_count;
|
||||
actl = 0;
|
||||
actmax = tries < 2 ? 0 : vm_cnt.v_active_count;
|
||||
dom = initial_dom;
|
||||
|
||||
/*
|
||||
* Scan domains in round-robin order, first inactive queues,
|
||||
* then active. Since domain usually owns large physically
|
||||
* contiguous chunk of memory, it makes sense to completely
|
||||
* exhaust one domain before switching to next, while growing
|
||||
* the pool of contiguous physical pages.
|
||||
*
|
||||
* Do not even start launder a domain which cannot contain
|
||||
* the specified address range, as indicated by segments
|
||||
* constituting the domain.
|
||||
*/
|
||||
again_inact:
|
||||
if (inactl < inactmax) {
|
||||
if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
|
||||
low, high) &&
|
||||
vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_INACTIVE],
|
||||
tries, low, high)) {
|
||||
inactl++;
|
||||
goto again_inact;
|
||||
}
|
||||
if (++dom == vm_ndomains)
|
||||
dom = 0;
|
||||
if (dom != initial_dom)
|
||||
goto again_inact;
|
||||
}
|
||||
again_act:
|
||||
if (actl < actmax) {
|
||||
if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
|
||||
low, high) &&
|
||||
vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_ACTIVE],
|
||||
tries, low, high)) {
|
||||
actl++;
|
||||
goto again_act;
|
||||
}
|
||||
if (++dom == vm_ndomains)
|
||||
dom = 0;
|
||||
if (dom != initial_dom)
|
||||
goto again_act;
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(NO_SWAPPING)
|
||||
/*
|
||||
* vm_pageout_object_deactivate_pages
|
||||
|
@@ -101,7 +101,6 @@ extern void vm_waitpfault(void);
 
 #ifdef _KERNEL
 int vm_pageout_flush(vm_page_t *, int, int, int, int *, boolean_t *);
-void vm_pageout_grow_cache(int, vm_paddr_t, vm_paddr_t);
 void vm_pageout_oom(int shortage);
 #endif
 #endif /* _VM_VM_PAGEOUT_H_ */
221
sys/vm/vm_phys.c
@ -170,6 +170,9 @@ static struct vm_domain_policy vm_default_policy =
|
||||
|
||||
static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool,
|
||||
int order);
|
||||
static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg,
|
||||
u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
|
||||
vm_paddr_t boundary);
|
||||
static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
|
||||
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
|
||||
static int vm_phys_paddr_to_segind(vm_paddr_t pa);
|
||||
@ -1162,6 +1165,56 @@ vm_phys_free_contig(vm_page_t m, u_long npages)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Scan physical memory between the specified addresses "low" and "high" for a
|
||||
* run of contiguous physical pages that satisfy the specified conditions, and
|
||||
* return the lowest page in the run. The specified "alignment" determines
|
||||
* the alignment of the lowest physical page in the run. If the specified
|
||||
* "boundary" is non-zero, then the run of physical pages cannot span a
|
||||
* physical address that is a multiple of "boundary".
|
||||
*
|
||||
* "npages" must be greater than zero. Both "alignment" and "boundary" must
|
||||
* be a power of two.
|
||||
*/
|
||||
vm_page_t
|
||||
vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
|
||||
u_long alignment, vm_paddr_t boundary, int options)
|
||||
{
|
||||
vm_paddr_t pa_end;
|
||||
vm_page_t m_end, m_run, m_start;
|
||||
struct vm_phys_seg *seg;
|
||||
int segind;
|
||||
|
||||
KASSERT(npages > 0, ("npages is 0"));
|
||||
KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
|
||||
KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
|
||||
if (low >= high)
|
||||
return (NULL);
|
||||
for (segind = 0; segind < vm_phys_nsegs; segind++) {
|
||||
seg = &vm_phys_segs[segind];
|
||||
if (seg->start >= high)
|
||||
break;
|
||||
if (low >= seg->end)
|
||||
continue;
|
||||
if (low <= seg->start)
|
||||
m_start = seg->first_page;
|
||||
else
|
||||
m_start = &seg->first_page[atop(low - seg->start)];
|
||||
if (high < seg->end)
|
||||
pa_end = high;
|
||||
else
|
||||
pa_end = seg->end;
|
||||
if (pa_end - VM_PAGE_TO_PHYS(m_start) < ptoa(npages))
|
||||
continue;
|
||||
m_end = &seg->first_page[atop(pa_end - seg->start)];
|
||||
m_run = vm_page_scan_contig(npages, m_start, m_end,
|
||||
alignment, boundary, options);
|
||||
if (m_run != NULL)
|
||||
return (m_run);
|
||||
}
|
||||
return (NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the pool for a contiguous, power of two-sized set of physical pages.
|
||||
*/
|
||||
@ -1300,93 +1353,123 @@ vm_page_t
|
||||
vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
|
||||
u_long alignment, vm_paddr_t boundary)
|
||||
{
|
||||
struct vm_freelist *fl;
|
||||
struct vm_phys_seg *seg;
|
||||
vm_paddr_t pa, pa_last, size;
|
||||
vm_page_t m, m_ret;
|
||||
u_long npages_end;
|
||||
int domain, flind, oind, order, pind;
|
||||
vm_paddr_t pa_end, pa_start;
|
||||
vm_page_t m_run;
|
||||
struct vm_domain_iterator vi;
|
||||
struct vm_phys_seg *seg;
|
||||
int domain, segind;
|
||||
|
||||
KASSERT(npages > 0, ("npages is 0"));
|
||||
KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
|
||||
KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
|
||||
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
|
||||
size = npages << PAGE_SHIFT;
|
||||
KASSERT(size != 0,
|
||||
("vm_phys_alloc_contig: size must not be 0"));
|
||||
KASSERT((alignment & (alignment - 1)) == 0,
|
||||
("vm_phys_alloc_contig: alignment must be a power of 2"));
|
||||
KASSERT((boundary & (boundary - 1)) == 0,
|
||||
("vm_phys_alloc_contig: boundary must be a power of 2"));
|
||||
/* Compute the queue that is the best fit for npages. */
|
||||
for (order = 0; (1 << order) < npages; order++);
|
||||
|
||||
if (low >= high)
|
||||
return (NULL);
|
||||
vm_policy_iterator_init(&vi);
|
||||
|
||||
restartdom:
|
||||
if (vm_domain_iterator_run(&vi, &domain) != 0) {
|
||||
vm_policy_iterator_finish(&vi);
|
||||
return (NULL);
|
||||
}
|
||||
m_run = NULL;
|
||||
for (segind = 0; segind < vm_phys_nsegs; segind++) {
|
||||
seg = &vm_phys_segs[segind];
|
||||
if (seg->start >= high)
|
||||
break;
|
||||
if (low >= seg->end || seg->domain != domain)
|
||||
continue;
|
||||
if (low <= seg->start)
|
||||
pa_start = seg->start;
|
||||
else
|
||||
pa_start = low;
|
||||
if (high < seg->end)
|
||||
pa_end = high;
|
||||
else
|
||||
pa_end = seg->end;
|
||||
if (pa_end - pa_start < ptoa(npages))
|
||||
continue;
|
||||
m_run = vm_phys_alloc_seg_contig(seg, npages, low, high,
|
||||
alignment, boundary);
|
||||
if (m_run != NULL)
|
||||
break;
|
||||
}
|
||||
if (m_run == NULL && !vm_domain_iterator_isdone(&vi))
|
||||
goto restartdom;
|
||||
vm_policy_iterator_finish(&vi);
|
||||
return (m_run);
|
||||
}
|
||||
|
||||
for (flind = 0; flind < vm_nfreelists; flind++) {
|
||||
for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) {
|
||||
for (pind = 0; pind < VM_NFREEPOOL; pind++) {
|
||||
fl = &vm_phys_free_queues[domain][flind][pind][0];
|
||||
TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
|
||||
/*
|
||||
* A free list may contain physical pages
|
||||
* from one or more segments.
|
||||
*/
|
||||
seg = &vm_phys_segs[m_ret->segind];
|
||||
if (seg->start > high ||
|
||||
low >= seg->end)
|
||||
continue;
|
||||
/*
|
||||
* Allocate a run of contiguous physical pages from the free list for the
|
||||
* specified segment.
|
||||
*/
|
||||
static vm_page_t
|
||||
vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages,
|
||||
vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
|
||||
{
|
||||
struct vm_freelist *fl;
|
||||
vm_paddr_t pa, pa_end, size;
|
||||
vm_page_t m, m_ret;
|
||||
u_long npages_end;
|
||||
int oind, order, pind;
|
||||
|
||||
KASSERT(npages > 0, ("npages is 0"));
|
||||
KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
|
||||
KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
|
||||
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
|
||||
/* Compute the queue that is the best fit for npages. */
|
||||
for (order = 0; (1 << order) < npages; order++);
|
||||
/* Search for a run satisfying the specified conditions. */
|
||||
size = npages << PAGE_SHIFT;
|
||||
for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER;
|
||||
oind++) {
|
||||
for (pind = 0; pind < VM_NFREEPOOL; pind++) {
|
||||
fl = (*seg->free_queues)[pind];
|
||||
TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
|
||||
/*
|
||||
* Is the size of this allocation request
|
||||
* larger than the largest block size?
|
||||
*/
|
||||
if (order >= VM_NFREEORDER) {
|
||||
/*
|
||||
* Is the size of this allocation request
|
||||
* larger than the largest block size?
|
||||
*/
|
||||
if (order >= VM_NFREEORDER) {
|
||||
/*
|
||||
* Determine if a sufficient number
|
||||
* of subsequent blocks to satisfy
|
||||
* the allocation request are free.
|
||||
*/
|
||||
pa = VM_PAGE_TO_PHYS(m_ret);
|
||||
pa_last = pa + size;
|
||||
for (;;) {
|
||||
pa += 1 << (PAGE_SHIFT + VM_NFREEORDER - 1);
|
||||
if (pa >= pa_last)
|
||||
break;
|
||||
if (pa < seg->start ||
|
||||
pa >= seg->end)
|
||||
break;
|
||||
m = &seg->first_page[atop(pa - seg->start)];
|
||||
if (m->order != VM_NFREEORDER - 1)
|
||||
break;
|
||||
}
|
||||
/* If not, continue to the next block. */
|
||||
if (pa < pa_last)
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine if the blocks are within the given range,
|
||||
* satisfy the given alignment, and do not cross the
|
||||
* given boundary.
|
||||
* Determine if a sufficient number of
|
||||
* subsequent blocks to satisfy the
|
||||
* allocation request are free.
|
||||
*/
|
||||
pa = VM_PAGE_TO_PHYS(m_ret);
|
||||
if (pa >= low &&
|
||||
pa + size <= high &&
|
||||
(pa & (alignment - 1)) == 0 &&
|
||||
((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0)
|
||||
goto done;
|
||||
pa_end = pa + size;
|
||||
for (;;) {
|
||||
pa += 1 << (PAGE_SHIFT +
|
||||
VM_NFREEORDER - 1);
|
||||
if (pa >= pa_end ||
|
||||
pa < seg->start ||
|
||||
pa >= seg->end)
|
||||
break;
|
||||
m = &seg->first_page[atop(pa -
|
||||
seg->start)];
|
||||
if (m->order != VM_NFREEORDER -
|
||||
1)
|
||||
break;
|
||||
}
|
||||
/* If not, go to the next block. */
|
||||
if (pa < pa_end)
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine if the blocks are within the
|
||||
* given range, satisfy the given alignment,
|
||||
* and do not cross the given boundary.
|
||||
*/
|
||||
pa = VM_PAGE_TO_PHYS(m_ret);
|
||||
pa_end = pa + size;
|
||||
if (pa >= low && pa_end <= high && (pa &
|
||||
(alignment - 1)) == 0 && ((pa ^ (pa_end -
|
||||
1)) & ~(boundary - 1)) == 0)
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!vm_domain_iterator_isdone(&vi))
|
||||
goto restartdom;
|
||||
vm_policy_iterator_finish(&vi);
|
||||
return (NULL);
|
||||
done:
|
||||
for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
|
||||
|
@@ -84,6 +84,8 @@ void vm_phys_free_contig(vm_page_t m, u_long npages);
 void vm_phys_free_pages(vm_page_t m, int order);
 void vm_phys_init(void);
 vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa);
+vm_page_t vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
+    u_long alignment, vm_paddr_t boundary, int options);
 void vm_phys_set_pool(int pool, vm_page_t m, int order);
 boolean_t vm_phys_unfree_page(vm_page_t m);
 boolean_t vm_phys_zero_pages_idle(void);
@ -865,6 +865,35 @@ vm_reserv_init(void)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns true if the given page belongs to a reservation and that page is
|
||||
* free. Otherwise, returns false.
|
||||
*/
|
||||
bool
|
||||
vm_reserv_is_page_free(vm_page_t m)
|
||||
{
|
||||
vm_reserv_t rv;
|
||||
|
||||
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
|
||||
rv = vm_reserv_from_page(m);
|
||||
if (rv->object == NULL)
|
||||
return (false);
|
||||
return (popmap_is_clear(rv->popmap, m - rv->pages));
|
||||
}
|
||||
|
||||
/*
|
||||
* If the given page belongs to a reservation, returns the level of that
|
||||
* reservation. Otherwise, returns -1.
|
||||
*/
|
||||
int
|
||||
vm_reserv_level(vm_page_t m)
|
||||
{
|
||||
vm_reserv_t rv;
|
||||
|
||||
rv = vm_reserv_from_page(m);
|
||||
return (rv->object != NULL ? 0 : -1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns a reservation level if the given page belongs to a fully-populated
|
||||
* reservation and -1 otherwise.
|
||||
@ -1075,6 +1104,23 @@ vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object,
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns the size (in bytes) of a reservation of the specified level.
|
||||
*/
|
||||
int
|
||||
vm_reserv_size(int level)
|
||||
{
|
||||
|
||||
switch (level) {
|
||||
case 0:
|
||||
return (VM_LEVEL_0_SIZE);
|
||||
case -1:
|
||||
return (PAGE_SIZE);
|
||||
default:
|
||||
return (0);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocates the virtual and physical memory required by the reservation
|
||||
* management system's data structures, in particular, the reservation array.
|
||||
|
@@ -53,6 +53,8 @@ vm_page_t vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex,
 void vm_reserv_break_all(vm_object_t object);
 boolean_t vm_reserv_free_page(vm_page_t m);
 void vm_reserv_init(void);
+bool vm_reserv_is_page_free(vm_page_t m);
+int vm_reserv_level(vm_page_t m);
 int vm_reserv_level_iffullpop(vm_page_t m);
 boolean_t vm_reserv_reactivate_page(vm_page_t m);
 boolean_t vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low,
@@ -60,6 +62,7 @@ boolean_t vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low,
 boolean_t vm_reserv_reclaim_inactive(void);
 void vm_reserv_rename(vm_page_t m, vm_object_t new_object,
     vm_object_t old_object, vm_pindex_t old_object_offset);
+int vm_reserv_size(int level);
 vm_paddr_t vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end,
     vm_paddr_t high_water);