From 4d19f4ad1fbf6bbe78f9b23b0ba4bfdc0e9421f2 Mon Sep 17 00:00:00 2001 From: Steven Hartland Date: Thu, 28 Aug 2014 19:50:08 +0000 Subject: [PATCH] Refactor ZFS ARC reclaim logic to be more VM cooperative Prior to this change we triggered ARC reclaim when kmem usage passed 3/4 of the total available, as indicated by vmem_size(kmem_arena, VMEM_ALLOC). This could lead to large amounts of unused RAM, e.g. on a 192GB machine with ARC the only major RAM consumer, 40GB of RAM would remain unused. The old method has also been seen to result in extreme RAM usage under certain loads, causing poor performance and stalls. We now trigger ARC reclaim when the number of free pages drops below the value defined by the new sysctl vfs.zfs.arc_free_target, which defaults to the value of vm.v_free_target. Credit to Karl Denninger for the original patch on which this update was based. PR: 191510 and 187594 Tested by: dteske MFC after: 1 week Relnotes: yes Sponsored by: Multiplay --- .../opensolaris/kern/opensolaris_kmem.c | 43 +++++++++-- sys/cddl/compat/opensolaris/sys/kmem.h | 11 ++- .../opensolaris/uts/common/fs/zfs/arc.c | 73 ++++++++++++++++--- sys/vm/vm_pageout.c | 25 +++++-- 4 files changed, 128 insertions(+), 24 deletions(-) diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c b/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c index daedc894f06f..10377cd29b7f 100644 --- a/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c +++ b/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c @@ -126,6 +126,42 @@ kmem_size_init(void *unused __unused) } SYSINIT(kmem_size_init, SI_SUB_KMEM, SI_ORDER_ANY, kmem_size_init, NULL); +/* + * The return values from kmem_free_* are only valid once the pagedaemon + * has been initialised, before then they return 0. + * + * To ensure the returns are valid the caller can use a SYSINIT with + * subsystem set to SI_SUB_KTHREAD_PAGE and an order of at least + * SI_ORDER_SECOND. 
+ */ +u_int +kmem_free_target(void) +{ + + return (vm_cnt.v_free_target); +} + +u_int +kmem_free_min(void) +{ + + return (vm_cnt.v_free_min); +} + +u_int +kmem_free_count(void) +{ + + return (vm_cnt.v_free_count); +} + +u_int +kmem_page_count(void) +{ + + return (vm_cnt.v_page_count); +} + uint64_t kmem_size(void) { @@ -133,13 +169,6 @@ kmem_size(void) return (kmem_size_val); } -uint64_t -kmem_used(void) -{ - - return (vmem_size(kmem_arena, VMEM_ALLOC)); -} - static int kmem_std_constructor(void *mem, int size __unused, void *private, int flags) { diff --git a/sys/cddl/compat/opensolaris/sys/kmem.h b/sys/cddl/compat/opensolaris/sys/kmem.h index ee6b33f7a982..af6cec52cf91 100644 --- a/sys/cddl/compat/opensolaris/sys/kmem.h +++ b/sys/cddl/compat/opensolaris/sys/kmem.h @@ -66,7 +66,16 @@ typedef struct kmem_cache { void *zfs_kmem_alloc(size_t size, int kmflags); void zfs_kmem_free(void *buf, size_t size); uint64_t kmem_size(void); -uint64_t kmem_used(void); +u_int kmem_page_count(void); + +/* + * The return values from kmem_free_* are only valid once the pagedaemon + * has been initialised, before then they return 0. + */ +u_int kmem_free_count(void); +u_int kmem_free_target(void); +u_int kmem_free_min(void); + kmem_cache_t *kmem_cache_create(char *name, size_t bufsize, size_t align, int (*constructor)(void *, void *, int), void (*destructor)(void *, void *), void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index cda427a0dab7..1d97718c70bf 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -193,9 +193,6 @@ extern int zfs_prefetch_disable; */ static boolean_t arc_warm; -/* - * These tunables are for performance analysis. 
- */ uint64_t zfs_arc_max; uint64_t zfs_arc_min; uint64_t zfs_arc_meta_limit = 0; @@ -204,6 +201,20 @@ int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; int zfs_disable_dup_eviction = 0; uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ +u_int zfs_arc_free_target = (1 << 19); /* default before pagedaemon init only */ + +static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS); + +#ifdef _KERNEL +static void +arc_free_target_init(void *unused __unused) +{ + + zfs_arc_free_target = kmem_free_target(); +} +SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, + arc_free_target_init, NULL); +#endif TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); SYSCTL_DECL(_vfs_zfs); @@ -214,6 +225,35 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN, &zfs_arc_average_blocksize, 0, "ARC average blocksize"); +/* + * We don't have a tunable for arc_free_target due to the dependency on + * pagedaemon initialisation. 
+ */ +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, + CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int), + sysctl_vfs_zfs_arc_free_target, "IU", + "Desired number of free pages below which ARC triggers reclaim"); + +static int +sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS) +{ + u_int val; + int err; + + val = zfs_arc_free_target; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val < kmem_free_min()) + return (EINVAL); + if (val > kmem_page_count()) + return (EINVAL); + + zfs_arc_free_target = val; + + return (0); +} /* * Note that buffers can be in one of 6 states: @@ -2418,9 +2458,12 @@ arc_flush(spa_t *spa) void arc_shrink(void) { + if (arc_c > arc_c_min) { uint64_t to_free; + DTRACE_PROBE2(arc__shrink, uint64_t, arc_c, uint64_t, + arc_c_min); #ifdef _KERNEL to_free = arc_c >> arc_shrink_shift; #else @@ -2440,8 +2483,11 @@ arc_shrink(void) ASSERT((int64_t)arc_p >= 0); } - if (arc_size > arc_c) + if (arc_size > arc_c) { + DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size, + uint64_t, arc_c); arc_adjust(); + } } static int needfree = 0; @@ -2452,15 +2498,25 @@ arc_reclaim_needed(void) #ifdef _KERNEL - if (needfree) + if (needfree) { + DTRACE_PROBE(arc__reclaim_needfree); return (1); + } + + if (kmem_free_count() < zfs_arc_free_target) { + DTRACE_PROBE2(arc__reclaim_freetarget, uint64_t, + kmem_free_count(), uint64_t, zfs_arc_free_target); + return (1); + } /* * Cooperate with pagedaemon when it's time for it to scan * and reclaim some pages. 
*/ - if (vm_paging_needed()) + if (vm_paging_needed()) { + DTRACE_PROBE(arc__reclaim_paging); return (1); + } #ifdef sun /* @@ -2504,15 +2560,14 @@ arc_reclaim_needed(void) (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) return (1); #endif -#else /* !sun */ - if (kmem_used() > (kmem_size() * 3) / 4) - return (1); #endif /* sun */ #else if (spa_get_random(100) == 0) return (1); #endif + DTRACE_PROBE(arc__reclaim_no); + return (0); } diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 760865c7fe56..9835d8d60c58 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -115,10 +115,14 @@ __FBSDID("$FreeBSD$"); /* the kernel process "vm_pageout"*/ static void vm_pageout(void); +static void vm_pageout_init(void); static int vm_pageout_clean(vm_page_t); static void vm_pageout_scan(struct vm_domain *vmd, int pass); static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass); +SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init, + NULL); + struct proc *pageproc; static struct kproc_desc page_kp = { @@ -126,7 +130,7 @@ static struct kproc_desc page_kp = { vm_pageout, &pageproc }; -SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, +SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &page_kp); #if !defined(NO_SWAPPING) @@ -1640,15 +1644,11 @@ vm_pageout_worker(void *arg) } /* - * vm_pageout is the high level pageout daemon. + * vm_pageout_init initialises basic pageout daemon settings. */ static void -vm_pageout(void) +vm_pageout_init(void) { -#if MAXMEMDOM > 1 - int error, i; -#endif - /* * Initialize some paging parameters. */ @@ -1694,6 +1694,17 @@ vm_pageout(void) /* XXX does not really belong here */ if (vm_page_max_wired == 0) vm_page_max_wired = vm_cnt.v_free_count / 3; +} + +/* + * vm_pageout is the high level pageout daemon. + */ +static void +vm_pageout(void) +{ +#if MAXMEMDOM > 1 + int error, i; +#endif swap_pager_swap_init(); #if MAXMEMDOM > 1