From 8b03c8ed5ea3bda230e983629dfc471e69b0f7f2 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Mon, 29 May 2000 22:40:54 +0000 Subject: [PATCH] This is a cleanup patch to Peter's new OBJT_PHYS VM object type and sysv shared memory support for it. It implements a new PG_UNMANAGED flag that has slightly different characteristics from PG_FICTITIOUS. A new sysctl, kern.ipc.shm_use_phys has been added to enable the use of physically-backed sysv shared memory rather than swap-backed. Physically backed shm segments are not tracked with PV entries, allowing programs which use a large shm segment as a rendezvous point to operate without eating an insane amount of KVM in the PV entry management. Read: Oracle. Peter's OBJT_PHYS object will also allow us to eventually implement page-table sharing and/or 4MB physical page support for such segments. We're half way there. --- sys/amd64/amd64/pmap.c | 11 ++++++-- sys/i386/i386/pmap.c | 11 ++++++-- sys/kern/sysv_shm.c | 17 +++++++----- sys/vm/phys_pager.c | 21 ++------------ sys/vm/vm_fault.c | 2 +- sys/vm/vm_object.c | 13 +++++++-- sys/vm/vm_object.h | 2 ++ sys/vm/vm_page.c | 63 +++++++++++++++++++++++++++++++++++++----- sys/vm/vm_page.h | 9 ++++++ sys/vm/vm_pageout.c | 16 +++++++---- 10 files changed, 118 insertions(+), 47 deletions(-) diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 5e3d6b5c517..7b274469c12 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -2095,7 +2095,8 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ - if (pmap_initialized && (m->flags & PG_FICTITIOUS) == 0) { + if (pmap_initialized && + (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { pmap_insert_entry(pmap, va, mpte, m); pa |= PG_MANAGED; } @@ -2223,7 +2224,8 @@ retry: * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. 
*/ - pmap_insert_entry(pmap, va, mpte, m); + if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) + pmap_insert_entry(pmap, va, mpte, m); /* * Increment counters @@ -2235,7 +2237,10 @@ retry: /* * Now validate mapping with RO protection */ - *pte = pa | PG_V | PG_U | PG_MANAGED; + if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) + *pte = pa | PG_V | PG_U; + else + *pte = pa | PG_V | PG_U | PG_MANAGED; return mpte; } diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index 5e3d6b5c517..7b274469c12 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -2095,7 +2095,8 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ - if (pmap_initialized && (m->flags & PG_FICTITIOUS) == 0) { + if (pmap_initialized && + (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { pmap_insert_entry(pmap, va, mpte, m); pa |= PG_MANAGED; } @@ -2223,7 +2224,8 @@ retry: * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. 
*/ - pmap_insert_entry(pmap, va, mpte, m); + if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) + pmap_insert_entry(pmap, va, mpte, m); /* * Increment counters @@ -2235,7 +2237,10 @@ retry: /* * Now validate mapping with RO protection */ - *pte = pa | PG_V | PG_U | PG_MANAGED; + if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) + *pte = pa | PG_V | PG_U; + else + *pte = pa | PG_V | PG_U | PG_MANAGED; return mpte; } diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c index 16019dcfebb..11a55686162 100644 --- a/sys/kern/sysv_shm.c +++ b/sys/kern/sysv_shm.c @@ -126,12 +126,15 @@ struct shminfo shminfo = { SHMALL }; +static int shm_use_phys; + SYSCTL_DECL(_kern_ipc); SYSCTL_INT(_kern_ipc, OID_AUTO, shmmax, CTLFLAG_RW, &shminfo.shmmax, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, shmmin, CTLFLAG_RW, &shminfo.shmmin, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, shmmni, CTLFLAG_RD, &shminfo.shmmni, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, shmseg, CTLFLAG_RW, &shminfo.shmseg, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, shmall, CTLFLAG_RW, &shminfo.shmall, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, shm_use_phys, CTLFLAG_RW, &shm_use_phys, 0, ""); static int shm_find_segment_by_key(key) @@ -528,13 +531,13 @@ shmget_allocate_segment(p, uap, mode) * We make sure that we have allocated a pager before we need * to. 
*/ -#ifdef SHM_PHYS_BACKED - shm_handle->shm_object = - vm_pager_allocate(OBJT_PHYS, 0, size, VM_PROT_DEFAULT, 0); -#else - shm_handle->shm_object = - vm_pager_allocate(OBJT_SWAP, 0, size, VM_PROT_DEFAULT, 0); -#endif + if (shm_use_phys) { + shm_handle->shm_object = + vm_pager_allocate(OBJT_PHYS, 0, size, VM_PROT_DEFAULT, 0); + } else { + shm_handle->shm_object = + vm_pager_allocate(OBJT_SWAP, 0, size, VM_PROT_DEFAULT, 0); + } vm_object_clear_flag(shm_handle->shm_object, OBJ_ONEMAPPING); vm_object_set_flag(shm_handle->shm_object, OBJ_NOSPLIT); diff --git a/sys/vm/phys_pager.c b/sys/vm/phys_pager.c index a48126c1a62..586844f3383 100644 --- a/sys/vm/phys_pager.c +++ b/sys/vm/phys_pager.c @@ -104,7 +104,9 @@ phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, object = vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(foff + size)); object->handle = handle; +#if 0 TAILQ_INIT(&object->un_pager.physp.physp_pglist); +#endif TAILQ_INSERT_TAIL(&phys_pager_object_list, object, pager_object_list); } else { @@ -131,20 +133,6 @@ phys_pager_dealloc(object) int s; TAILQ_REMOVE(&phys_pager_object_list, object, pager_object_list); - /* - * Free up our fake pages. - */ - s = splvm(); - while ((m = TAILQ_FIRST(&object->un_pager.physp.physp_pglist)) != 0) { - TAILQ_REMOVE(&object->un_pager.physp.physp_pglist, m, pageq); - /* return the page back to normal */ - m->flags &= ~PG_FICTITIOUS; - m->dirty = 0; - vm_page_unwire(m, 0); - vm_page_flag_clear(m, PG_ZERO); - vm_page_free(m); - } - splx(s); } static int @@ -165,8 +153,7 @@ phys_pager_getpages(object, m, count, reqpage) vm_page_zero_fill(m[i]); vm_page_flag_set(m[i], PG_ZERO); /* Switch off pv_entries */ - vm_page_wire(m[i]); - vm_page_flag_set(m[i], PG_FICTITIOUS); + vm_page_unmanage(m[i]); m[i]->valid = VM_PAGE_BITS_ALL; m[i]->dirty = 0; /* The requested page must remain busy, the others not. 
*/ @@ -174,8 +161,6 @@ phys_pager_getpages(object, m, count, reqpage) vm_page_flag_clear(m[i], PG_BUSY); m[i]->busy = 0; } - TAILQ_INSERT_TAIL(&object->un_pager.physp.physp_pglist, m[i], - pageq); } splx(s); diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index 14133fa8e7f..af4fe35cb65 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -423,7 +423,7 @@ readrest: if (mt == NULL || (mt->valid != VM_PAGE_BITS_ALL)) break; if (mt->busy || - (mt->flags & (PG_BUSY | PG_FICTITIOUS)) || + (mt->flags & (PG_BUSY | PG_FICTITIOUS | PG_UNMANAGED)) || mt->hold_count || mt->wire_count) continue; diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c index 1b33f786768..e5403d14b50 100644 --- a/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -833,12 +833,14 @@ shadowlookup: /* * If the page is busy or not in a normal active state, - * we skip it. Things can break if we mess with pages - * in any of the below states. + * we skip it. If the page is not managed there are no + * page queues to mess with. Things can break if we mess + * with pages in any of the below states. */ if ( m->hold_count || m->wire_count || + (m->flags & PG_UNMANAGED) || m->valid != VM_PAGE_BITS_ALL ) { continue; @@ -1394,6 +1396,13 @@ vm_object_page_remove(object, start, end, clean_only) all = ((end == 0) && (start == 0)); + /* + * Since physically-backed objects do not use managed pages, we can't + * remove pages from the object (we must instead remove the page + * references, and then destroy the object). 
+ */ + KASSERT(object->type != OBJT_PHYS, ("attempt to remove pages from a physical object")); + vm_object_pip_add(object, 1); again: size = end - start; diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h index 62a3bbe2416..c9f239c6071 100644 --- a/sys/vm/vm_object.h +++ b/sys/vm/vm_object.h @@ -123,6 +123,7 @@ struct vm_object { TAILQ_HEAD(, vm_page) devp_pglist; } devp; +#if 0 /* * Physmem pager * @@ -131,6 +132,7 @@ struct vm_object { struct { TAILQ_HEAD(, vm_page) physp_pglist; } physp; +#endif /* * Swap pager diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 6b2b320f22e..9701150cbcb 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -688,7 +688,7 @@ vm_page_select_cache(object, pindex) (pindex + object->pg_color) & PQ_L2_MASK, FALSE ); - if (m && ((m->flags & PG_BUSY) || m->busy || + if (m && ((m->flags & (PG_BUSY|PG_UNMANAGED)) || m->busy || m->hold_count || m->wire_count)) { vm_page_deactivate(m); continue; @@ -997,7 +997,7 @@ vm_page_activate(m) vm_page_unqueue(m); - if (m->wire_count == 0) { + if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) { m->queue = PQ_ACTIVE; vm_page_queues[PQ_ACTIVE].lcnt++; TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq); @@ -1128,9 +1128,17 @@ vm_page_free_toq(vm_page_t m) } } + /* + * Clear the UNMANAGED flag when freeing an unmanaged page. + */ + + if (m->flags & PG_UNMANAGED) { + m->flags &= ~PG_UNMANAGED; + } else { #ifdef __alpha__ - pmap_page_is_free(m); + pmap_page_is_free(m); #endif + } m->queue = PQ_FREE + m->pc; pq = &vm_page_queues[m->queue]; @@ -1154,6 +1162,39 @@ vm_page_free_toq(vm_page_t m) splx(s); } +/* + * vm_page_unmanage: + * + * Prevent PV management from being done on the page. The page is + * removed from the paging queues as if it were wired, and as a + * consequence of no longer being managed the pageout daemon will not + * touch it (since there is no way to locate the pte mappings for the + * page). 
madvise() calls that mess with the pmap will also no longer + * operate on the page. + * + * Beyond that the page is still reasonably 'normal'. Freeing the page + * will clear the flag. + * + * This routine is used by OBJT_PHYS objects - objects using unswappable + * physical memory as backing store rather than swap-backed memory and + * will eventually be extended to support 4MB unmanaged physical + * mappings. + */ + +void +vm_page_unmanage(vm_page_t m) +{ + int s; + + s = splvm(); + if ((m->flags & PG_UNMANAGED) == 0) { + if (m->wire_count == 0) + vm_page_unqueue(m); + } + vm_page_flag_set(m, PG_UNMANAGED); + splx(s); +} + /* * vm_page_wire: * @@ -1170,9 +1211,15 @@ vm_page_wire(m) { int s; + /* + * Only bump the wire statistics if the page is not already wired, + * and only unqueue the page if it is on some queue (if it is unmanaged + * it is already off the queues). + */ s = splvm(); if (m->wire_count == 0) { - vm_page_unqueue(m); + if ((m->flags & PG_UNMANAGED) == 0) + vm_page_unqueue(m); cnt.v_wire_count++; } m->wire_count++; @@ -1218,7 +1265,9 @@ vm_page_unwire(m, activate) m->wire_count--; if (m->wire_count == 0) { cnt.v_wire_count--; - if (activate) { + if (m->flags & PG_UNMANAGED) { + ; + } else if (activate) { TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq); m->queue = PQ_ACTIVE; vm_page_queues[PQ_ACTIVE].lcnt++; @@ -1259,7 +1308,7 @@ _vm_page_deactivate(vm_page_t m, int athead) return; s = splvm(); - if (m->wire_count == 0) { + if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) { if ((m->queue - m->pc) == PQ_CACHE) cnt.v_reactivated++; vm_page_unqueue(m); @@ -1293,7 +1342,7 @@ vm_page_cache(m) { int s; - if ((m->flags & PG_BUSY) || m->busy || m->wire_count) { + if ((m->flags & (PG_BUSY|PG_UNMANAGED)) || m->busy || m->wire_count) { printf("vm_page_cache: attempting to cache busy page\n"); return; } diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index e61be7fd531..a25c6ac7bb4 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ 
-225,6 +225,13 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT]; * These are the flags defined for vm_page. * * Note: PG_FILLED and PG_DIRTY are added for the filesystems. + * + * Note: PG_UNMANAGED (used by OBJT_PHYS) indicates that the page is + * not under PV management but otherwise should be treated as a + * normal page. Pages not under PV management cannot be paged out + * via the object/vm_page_t because there is no knowledge of their + * pte mappings, nor can they be removed from their objects via + * the object, and such pages are also not on any PQ queue. */ #define PG_BUSY 0x0001 /* page is in transit (O) */ #define PG_WANTED 0x0002 /* someone is waiting for page (O) */ @@ -236,6 +243,7 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT]; #define PG_CLEANCHK 0x0100 /* page will be checked for cleaning */ #define PG_SWAPINPROG 0x0200 /* swap I/O in progress on page */ #define PG_NOSYNC 0x0400 /* do not collect for syncer */ +#define PG_UNMANAGED 0x0800 /* No PV management for page */ /* * Misc constants. @@ -399,6 +407,7 @@ void vm_page_remove __P((vm_page_t)); void vm_page_rename __P((vm_page_t, vm_object_t, vm_pindex_t)); vm_offset_t vm_page_startup __P((vm_offset_t, vm_offset_t, vm_offset_t)); vm_page_t vm_add_new_page __P((vm_offset_t pa)); +void vm_page_unmanage __P((vm_page_t)); void vm_page_unwire __P((vm_page_t, int)); void vm_page_wire __P((vm_page_t)); void vm_page_unqueue __P((vm_page_t)); diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 16271180333..307dd0b7527 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -233,11 +233,12 @@ vm_pageout_clean(m) */ /* - * Don't mess with the page if it's busy. 
+ * Don't mess with the page if it's busy, held, or special */ if ((m->hold_count != 0) || - ((m->busy != 0) || (m->flags & PG_BUSY))) + ((m->busy != 0) || (m->flags & (PG_BUSY|PG_UNMANAGED)))) { return 0; + } mc[vm_pageout_page_count] = m; pageout_count = 1; @@ -279,7 +280,7 @@ more: break; } if (((p->queue - p->pc) == PQ_CACHE) || - (p->flags & PG_BUSY) || p->busy) { + (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) { ib = 0; break; } @@ -309,7 +310,7 @@ more: if ((p = vm_page_lookup(object, pindex + is)) == NULL) break; if (((p->queue - p->pc) == PQ_CACHE) || - (p->flags & PG_BUSY) || p->busy) { + (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) { break; } vm_page_test_dirty(p); @@ -474,7 +475,7 @@ vm_pageout_object_deactivate_pages(map, object, desired, map_remove_only) if (p->wire_count != 0 || p->hold_count != 0 || p->busy != 0 || - (p->flags & PG_BUSY) || + (p->flags & (PG_BUSY|PG_UNMANAGED)) || !pmap_page_exists(vm_map_pmap(map), p)) { p = next; continue; @@ -1047,7 +1048,10 @@ rescan0: m = vm_page_list_find(PQ_CACHE, cache_rover, FALSE); if (!m) break; - if ((m->flags & PG_BUSY) || m->busy || m->hold_count || m->wire_count) { + if ((m->flags & (PG_BUSY|PG_UNMANAGED)) || + m->busy || + m->hold_count || + m->wire_count) { #ifdef INVARIANTS printf("Warning: busy page %p found in cache\n", m); #endif