mirror of
https://git.FreeBSD.org/src.git
synced 2025-01-03 12:35:02 +00:00
8484 Implement aggregate sum and use for arc counters
In pursuit of improving performance on multi-core systems, we should implement fanned-out counters and use them to improve the performance of some of the arc statistics. These stats are updated extremely frequently, and can consume a significant amount of CPU time. Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Approved by: Dan McDonald <danmcd@joyent.com> Author: Paul Dagnelie <pcd@delphix.com>
This commit is contained in:
parent
695c9b6645
commit
ba31d3992e
Notes:
svn2git
2020-12-20 02:59:44 +00:00
svn path=/vendor-sys/illumos/dist/; revision=331400
@ -1339,12 +1339,14 @@ LUA_OBJS += \
|
|||||||
|
|
||||||
ZFS_COMMON_OBJS += \
|
ZFS_COMMON_OBJS += \
|
||||||
abd.o \
|
abd.o \
|
||||||
|
aggsum.o \
|
||||||
arc.o \
|
arc.o \
|
||||||
blkptr.o \
|
blkptr.o \
|
||||||
bplist.o \
|
bplist.o \
|
||||||
bpobj.o \
|
bpobj.o \
|
||||||
bptree.o \
|
bptree.o \
|
||||||
bqueue.o \
|
bqueue.o \
|
||||||
|
cityhash.o \
|
||||||
dbuf.o \
|
dbuf.o \
|
||||||
ddt.o \
|
ddt.o \
|
||||||
ddt_zap.o \
|
ddt_zap.o \
|
||||||
|
19
uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash
Normal file
19
uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
Copyright (c) 2011 Google, Inc.
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in
|
||||||
|
all copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
THE SOFTWARE.
|
1
uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip
Normal file
1
uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip
Normal file
@ -0,0 +1 @@
|
|||||||
|
CITYHASH CHECKSUM FUNCTIONALITY IN ZFS
|
232
uts/common/fs/zfs/aggsum.c
Normal file
232
uts/common/fs/zfs/aggsum.c
Normal file
@ -0,0 +1,232 @@
|
|||||||
|
/*
|
||||||
|
* CDDL HEADER START
|
||||||
|
*
|
||||||
|
* This file and its contents are supplied under the terms of the
|
||||||
|
* Common Development and Distribution License ("CDDL"), version 1.0.
|
||||||
|
* You may only use this file in accordance with the terms of version
|
||||||
|
* 1.0 of the CDDL.
|
||||||
|
*
|
||||||
|
* A full copy of the text of the CDDL should have accompanied this
|
||||||
|
* source. A copy of the CDDL is also available via the Internet at
|
||||||
|
* http://www.illumos.org/license/CDDL.
|
||||||
|
*
|
||||||
|
* CDDL HEADER END
|
||||||
|
*/
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2017 by Delphix. All rights reserved.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <sys/zfs_context.h>
|
||||||
|
#include <sys/aggsum.h>
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Aggregate-sum counters are a form of fanned-out counter, used when atomic
|
||||||
|
* instructions on a single field cause enough CPU cache line contention to
|
||||||
|
* slow system performance. Due to their increased overhead and the expense
|
||||||
|
* involved with precisely reading from them, they should only be used in cases
|
||||||
|
* where the write rate (increment/decrement) is much higher than the read rate
|
||||||
|
* (get value).
|
||||||
|
*
|
||||||
|
* Aggregate sum counters are comprised of two basic parts, the core and the
|
||||||
|
* buckets. The core counter contains a lock for the entire counter, as well
|
||||||
|
* as the current upper and lower bounds on the value of the counter. The
|
||||||
|
* aggsum_bucket structure contains a per-bucket lock to protect the contents of
|
||||||
|
* the bucket, the current amount that this bucket has changed from the global
|
||||||
|
* counter (called the delta), and the amount of increment and decrement we have
|
||||||
|
* "borrowed" from the core counter.
|
||||||
|
*
|
||||||
|
* The basic operation of an aggsum is simple. Threads that wish to modify the
|
||||||
|
* counter will modify one bucket's counter (determined by their current CPU, to
|
||||||
|
* help minimize lock and cache contention). If the bucket already has
|
||||||
|
* sufficient capacity borrowed from the core structure to handle their request,
|
||||||
|
* they simply modify the delta and return. If the bucket does not, we clear
|
||||||
|
* the bucket's current state (to prevent the borrowed amounts from getting too
|
||||||
|
* large), and borrow more from the core counter. Borrowing is done by adding to
|
||||||
|
* the upper bound (or subtracting from the lower bound) of the core counter,
|
||||||
|
* and setting the borrow value for the bucket to the amount added (or
|
||||||
|
* subtracted). Clearing the bucket is the opposite; we add the current delta
|
||||||
|
* to both the lower and upper bounds of the core counter, subtract the borrowed
|
||||||
|
* increment from the upper bound, and add the borrowed decrement to the
|
||||||
|
* lower bound. Note that only borrowing and clearing require access to the
|
||||||
|
* core counter; since all other operations access CPU-local resources,
|
||||||
|
* performance can be much higher than a traditional counter.
|
||||||
|
*
|
||||||
|
* Threads that wish to read from the counter have a slightly more challenging
|
||||||
|
* task. It is fast to determine the upper and lower bounds of the aggsum; this
|
||||||
|
* does not require grabbing any locks. This suffices for cases where an
|
||||||
|
* approximation of the aggsum's value is acceptable. However, if one needs to
|
||||||
|
* know whether some specific value is above or below the current value in the
|
||||||
|
* aggsum, they invoke aggsum_compare(). This function operates by repeatedly
|
||||||
|
* comparing the target value to the upper and lower bounds of the aggsum, and
|
||||||
|
* then clearing a bucket. This proceeds until the target is outside of the
|
||||||
|
* upper and lower bounds and we return a response, or the last bucket has been
|
||||||
|
* cleared and we know that the target is equal to the aggsum's value. Finally,
|
||||||
|
* the most expensive operation is determining the precise value of the aggsum.
|
||||||
|
* To do this, we clear every bucket and then return the upper bound (which must
|
||||||
|
* be equal to the lower bound). What makes aggsum_compare() and aggsum_value()
|
||||||
|
* expensive is clearing buckets. This involves grabbing the global lock
|
||||||
|
* (serializing against themselves and borrow operations), grabbing a bucket's
|
||||||
|
* lock (preventing threads on those CPUs from modifying their delta), and
|
||||||
|
* zeroing out the borrowed value (forcing that thread to borrow on its next
|
||||||
|
* request, which will also be expensive). This is what makes aggsums well
|
||||||
|
* suited for write-many read-rarely operations.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We will borrow aggsum_borrow_multiplier times the current request, so we will
|
||||||
|
* have to get the as_lock approximately every aggsum_borrow_multiplier calls to
|
||||||
|
* aggsum_delta().
|
||||||
|
*/
|
||||||
|
static uint_t aggsum_borrow_multiplier = 10;
|
||||||
|
|
||||||
|
void
|
||||||
|
aggsum_init(aggsum_t *as, uint64_t value)
|
||||||
|
{
|
||||||
|
bzero(as, sizeof (*as));
|
||||||
|
as->as_lower_bound = as->as_upper_bound = value;
|
||||||
|
mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||||
|
as->as_numbuckets = boot_ncpus;
|
||||||
|
as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t),
|
||||||
|
KM_SLEEP);
|
||||||
|
for (int i = 0; i < as->as_numbuckets; i++) {
|
||||||
|
mutex_init(&as->as_buckets[i].asc_lock,
|
||||||
|
NULL, MUTEX_DEFAULT, NULL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
aggsum_fini(aggsum_t *as)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < as->as_numbuckets; i++)
|
||||||
|
mutex_destroy(&as->as_buckets[i].asc_lock);
|
||||||
|
mutex_destroy(&as->as_lock);
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t
|
||||||
|
aggsum_lower_bound(aggsum_t *as)
|
||||||
|
{
|
||||||
|
return (as->as_lower_bound);
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t
|
||||||
|
aggsum_upper_bound(aggsum_t *as)
|
||||||
|
{
|
||||||
|
return (as->as_upper_bound);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb)
|
||||||
|
{
|
||||||
|
ASSERT(MUTEX_HELD(&as->as_lock));
|
||||||
|
ASSERT(MUTEX_HELD(&asb->asc_lock));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We use atomic instructions for this because we read the upper and
|
||||||
|
* lower bounds without the lock, so we need stores to be atomic.
|
||||||
|
*/
|
||||||
|
atomic_add_64((volatile uint64_t *)&as->as_lower_bound, asb->asc_delta);
|
||||||
|
atomic_add_64((volatile uint64_t *)&as->as_upper_bound, asb->asc_delta);
|
||||||
|
asb->asc_delta = 0;
|
||||||
|
atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
|
||||||
|
-asb->asc_borrowed);
|
||||||
|
atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
|
||||||
|
asb->asc_borrowed);
|
||||||
|
asb->asc_borrowed = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t
|
||||||
|
aggsum_value(aggsum_t *as)
|
||||||
|
{
|
||||||
|
int64_t rv;
|
||||||
|
|
||||||
|
mutex_enter(&as->as_lock);
|
||||||
|
if (as->as_lower_bound == as->as_upper_bound) {
|
||||||
|
rv = as->as_lower_bound;
|
||||||
|
for (int i = 0; i < as->as_numbuckets; i++) {
|
||||||
|
ASSERT0(as->as_buckets[i].asc_delta);
|
||||||
|
ASSERT0(as->as_buckets[i].asc_borrowed);
|
||||||
|
}
|
||||||
|
mutex_exit(&as->as_lock);
|
||||||
|
return (rv);
|
||||||
|
}
|
||||||
|
for (int i = 0; i < as->as_numbuckets; i++) {
|
||||||
|
struct aggsum_bucket *asb = &as->as_buckets[i];
|
||||||
|
mutex_enter(&asb->asc_lock);
|
||||||
|
aggsum_flush_bucket(as, asb);
|
||||||
|
mutex_exit(&asb->asc_lock);
|
||||||
|
}
|
||||||
|
VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
|
||||||
|
rv = as->as_lower_bound;
|
||||||
|
mutex_exit(&as->as_lock);
|
||||||
|
|
||||||
|
return (rv);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
aggsum_borrow(aggsum_t *as, int64_t delta, struct aggsum_bucket *asb)
|
||||||
|
{
|
||||||
|
int64_t abs_delta = (delta < 0 ? -delta : delta);
|
||||||
|
mutex_enter(&as->as_lock);
|
||||||
|
mutex_enter(&asb->asc_lock);
|
||||||
|
|
||||||
|
aggsum_flush_bucket(as, asb);
|
||||||
|
|
||||||
|
atomic_add_64((volatile uint64_t *)&as->as_upper_bound, abs_delta);
|
||||||
|
atomic_add_64((volatile uint64_t *)&as->as_lower_bound, -abs_delta);
|
||||||
|
asb->asc_borrowed = abs_delta;
|
||||||
|
|
||||||
|
mutex_exit(&asb->asc_lock);
|
||||||
|
mutex_exit(&as->as_lock);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
aggsum_add(aggsum_t *as, int64_t delta)
|
||||||
|
{
|
||||||
|
struct aggsum_bucket *asb =
|
||||||
|
&as->as_buckets[CPU_SEQID % as->as_numbuckets];
|
||||||
|
|
||||||
|
for (;;) {
|
||||||
|
mutex_enter(&asb->asc_lock);
|
||||||
|
if (asb->asc_delta + delta <= (int64_t)asb->asc_borrowed &&
|
||||||
|
asb->asc_delta + delta >= -(int64_t)asb->asc_borrowed) {
|
||||||
|
asb->asc_delta += delta;
|
||||||
|
mutex_exit(&asb->asc_lock);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
mutex_exit(&asb->asc_lock);
|
||||||
|
aggsum_borrow(as, delta * aggsum_borrow_multiplier, asb);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Compare the aggsum value to target efficiently. Returns -1 if the value
|
||||||
|
* represented by the aggsum is less than target, 1 if it's greater, and 0 if
|
||||||
|
* they are equal.
|
||||||
|
*/
|
||||||
|
int
|
||||||
|
aggsum_compare(aggsum_t *as, uint64_t target)
|
||||||
|
{
|
||||||
|
if (as->as_upper_bound < target)
|
||||||
|
return (-1);
|
||||||
|
if (as->as_lower_bound > target)
|
||||||
|
return (1);
|
||||||
|
mutex_enter(&as->as_lock);
|
||||||
|
for (int i = 0; i < as->as_numbuckets; i++) {
|
||||||
|
struct aggsum_bucket *asb = &as->as_buckets[i];
|
||||||
|
mutex_enter(&asb->asc_lock);
|
||||||
|
aggsum_flush_bucket(as, asb);
|
||||||
|
mutex_exit(&asb->asc_lock);
|
||||||
|
if (as->as_upper_bound < target) {
|
||||||
|
mutex_exit(&as->as_lock);
|
||||||
|
return (-1);
|
||||||
|
}
|
||||||
|
if (as->as_lower_bound > target) {
|
||||||
|
mutex_exit(&as->as_lock);
|
||||||
|
return (1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
|
||||||
|
ASSERT3U(as->as_lower_bound, ==, target);
|
||||||
|
mutex_exit(&as->as_lock);
|
||||||
|
return (0);
|
||||||
|
}
|
@ -275,6 +275,8 @@
|
|||||||
#include <sys/callb.h>
|
#include <sys/callb.h>
|
||||||
#include <sys/kstat.h>
|
#include <sys/kstat.h>
|
||||||
#include <zfs_fletcher.h>
|
#include <zfs_fletcher.h>
|
||||||
|
#include <sys/aggsum.h>
|
||||||
|
#include <sys/cityhash.h>
|
||||||
|
|
||||||
#ifndef _KERNEL
|
#ifndef _KERNEL
|
||||||
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
|
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
|
||||||
@ -465,6 +467,7 @@ typedef struct arc_stats {
|
|||||||
kstat_named_t arcstat_c;
|
kstat_named_t arcstat_c;
|
||||||
kstat_named_t arcstat_c_min;
|
kstat_named_t arcstat_c_min;
|
||||||
kstat_named_t arcstat_c_max;
|
kstat_named_t arcstat_c_max;
|
||||||
|
/* Not updated directly; only synced in arc_kstat_update. */
|
||||||
kstat_named_t arcstat_size;
|
kstat_named_t arcstat_size;
|
||||||
/*
|
/*
|
||||||
* Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
|
* Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
|
||||||
@ -493,12 +496,14 @@ typedef struct arc_stats {
|
|||||||
* (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
|
* (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
|
||||||
* caches), and arc_buf_t structures (allocated via arc_buf_t
|
* caches), and arc_buf_t structures (allocated via arc_buf_t
|
||||||
* cache).
|
* cache).
|
||||||
|
* Not updated directly; only synced in arc_kstat_update.
|
||||||
*/
|
*/
|
||||||
kstat_named_t arcstat_hdr_size;
|
kstat_named_t arcstat_hdr_size;
|
||||||
/*
|
/*
|
||||||
* Number of bytes consumed by ARC buffers of type equal to
|
* Number of bytes consumed by ARC buffers of type equal to
|
||||||
* ARC_BUFC_DATA. This is generally consumed by buffers backing
|
* ARC_BUFC_DATA. This is generally consumed by buffers backing
|
||||||
* on disk user data (e.g. plain file contents).
|
* on disk user data (e.g. plain file contents).
|
||||||
|
* Not updated directly; only synced in arc_kstat_update.
|
||||||
*/
|
*/
|
||||||
kstat_named_t arcstat_data_size;
|
kstat_named_t arcstat_data_size;
|
||||||
/*
|
/*
|
||||||
@ -506,6 +511,7 @@ typedef struct arc_stats {
|
|||||||
* ARC_BUFC_METADATA. This is generally consumed by buffers
|
* ARC_BUFC_METADATA. This is generally consumed by buffers
|
||||||
* backing on disk data that is used for internal ZFS
|
* backing on disk data that is used for internal ZFS
|
||||||
* structures (e.g. ZAP, dnode, indirect blocks, etc).
|
* structures (e.g. ZAP, dnode, indirect blocks, etc).
|
||||||
|
* Not updated directly; only synced in arc_kstat_update.
|
||||||
*/
|
*/
|
||||||
kstat_named_t arcstat_metadata_size;
|
kstat_named_t arcstat_metadata_size;
|
||||||
/*
|
/*
|
||||||
@ -514,6 +520,7 @@ typedef struct arc_stats {
|
|||||||
* buffers (allocated directly via zio_buf_* functions),
|
* buffers (allocated directly via zio_buf_* functions),
|
||||||
* dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
|
* dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
|
||||||
* cache), and dnode_t structures (allocated via dnode_t cache).
|
* cache), and dnode_t structures (allocated via dnode_t cache).
|
||||||
|
* Not updated directly; only synced in arc_kstat_update.
|
||||||
*/
|
*/
|
||||||
kstat_named_t arcstat_other_size;
|
kstat_named_t arcstat_other_size;
|
||||||
/*
|
/*
|
||||||
@ -521,6 +528,7 @@ typedef struct arc_stats {
|
|||||||
* arc_anon state. This includes *all* buffers in the arc_anon
|
* arc_anon state. This includes *all* buffers in the arc_anon
|
||||||
* state; e.g. data, metadata, evictable, and unevictable buffers
|
* state; e.g. data, metadata, evictable, and unevictable buffers
|
||||||
* are all included in this value.
|
* are all included in this value.
|
||||||
|
* Not updated directly; only synced in arc_kstat_update.
|
||||||
*/
|
*/
|
||||||
kstat_named_t arcstat_anon_size;
|
kstat_named_t arcstat_anon_size;
|
||||||
/*
|
/*
|
||||||
@ -528,6 +536,7 @@ typedef struct arc_stats {
|
|||||||
* following criteria: backing buffers of type ARC_BUFC_DATA,
|
* following criteria: backing buffers of type ARC_BUFC_DATA,
|
||||||
* residing in the arc_anon state, and are eligible for eviction
|
* residing in the arc_anon state, and are eligible for eviction
|
||||||
* (e.g. have no outstanding holds on the buffer).
|
* (e.g. have no outstanding holds on the buffer).
|
||||||
|
* Not updated directly; only synced in arc_kstat_update.
|
||||||
*/
|
*/
|
||||||
kstat_named_t arcstat_anon_evictable_data;
|
kstat_named_t arcstat_anon_evictable_data;
|
||||||
/*
|
/*
|
||||||
@ -535,6 +544,7 @@ typedef struct arc_stats {
|
|||||||
* following criteria: backing buffers of type ARC_BUFC_METADATA,
|
* following criteria: backing buffers of type ARC_BUFC_METADATA,
|
||||||
* residing in the arc_anon state, and are eligible for eviction
|
* residing in the arc_anon state, and are eligible for eviction
|
||||||
* (e.g. have no outstanding holds on the buffer).
|
* (e.g. have no outstanding holds on the buffer).
|
||||||
|
* Not updated directly; only synced in arc_kstat_update.
|
||||||
*/
|
*/
|
||||||
kstat_named_t arcstat_anon_evictable_metadata;
|
kstat_named_t arcstat_anon_evictable_metadata;
|
||||||
/*
|
/*
|
||||||
@ -542,6 +552,7 @@ typedef struct arc_stats {
|
|||||||
* arc_mru state. This includes *all* buffers in the arc_mru
|
* arc_mru state. This includes *all* buffers in the arc_mru
|
||||||
* state; e.g. data, metadata, evictable, and unevictable buffers
|
* state; e.g. data, metadata, evictable, and unevictable buffers
|
||||||
* are all included in this value.
|
* are all included in this value.
|
||||||
|
* Not updated directly; only synced in arc_kstat_update.
|
||||||
*/
|
*/
|
||||||
kstat_named_t arcstat_mru_size;
|
kstat_named_t arcstat_mru_size;
|
||||||
/*
|
/*
|
||||||
@ -549,6 +560,7 @@ typedef struct arc_stats {
|
|||||||
* following criteria: backing buffers of type ARC_BUFC_DATA,
|
* following criteria: backing buffers of type ARC_BUFC_DATA,
|
||||||
* residing in the arc_mru state, and are eligible for eviction
|
* residing in the arc_mru state, and are eligible for eviction
|
||||||
* (e.g. have no outstanding holds on the buffer).
|
* (e.g. have no outstanding holds on the buffer).
|
||||||
|
* Not updated directly; only synced in arc_kstat_update.
|
||||||
*/
|
*/
|
||||||
kstat_named_t arcstat_mru_evictable_data;
|
kstat_named_t arcstat_mru_evictable_data;
|
||||||
/*
|
/*
|
||||||
@ -556,6 +568,7 @@ typedef struct arc_stats {
|
|||||||
* following criteria: backing buffers of type ARC_BUFC_METADATA,
|
* following criteria: backing buffers of type ARC_BUFC_METADATA,
|
||||||
* residing in the arc_mru state, and are eligible for eviction
|
* residing in the arc_mru state, and are eligible for eviction
|
||||||
* (e.g. have no outstanding holds on the buffer).
|
* (e.g. have no outstanding holds on the buffer).
|
||||||
|
* Not updated directly; only synced in arc_kstat_update.
|
||||||
*/
|
*/
|
||||||
kstat_named_t arcstat_mru_evictable_metadata;
|
kstat_named_t arcstat_mru_evictable_metadata;
|
||||||
/*
|
/*
|
||||||
@ -566,18 +579,21 @@ typedef struct arc_stats {
|
|||||||
* don't actually have ARC buffers linked off of these headers.
|
* don't actually have ARC buffers linked off of these headers.
|
||||||
* Thus, *if* the headers had associated ARC buffers, these
|
* Thus, *if* the headers had associated ARC buffers, these
|
||||||
* buffers *would have* consumed this number of bytes.
|
* buffers *would have* consumed this number of bytes.
|
||||||
|
* Not updated directly; only synced in arc_kstat_update.
|
||||||
*/
|
*/
|
||||||
kstat_named_t arcstat_mru_ghost_size;
|
kstat_named_t arcstat_mru_ghost_size;
|
||||||
/*
|
/*
|
||||||
* Number of bytes that *would have been* consumed by ARC
|
* Number of bytes that *would have been* consumed by ARC
|
||||||
* buffers that are eligible for eviction, of type
|
* buffers that are eligible for eviction, of type
|
||||||
* ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
|
* ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
|
||||||
|
* Not updated directly; only synced in arc_kstat_update.
|
||||||
*/
|
*/
|
||||||
kstat_named_t arcstat_mru_ghost_evictable_data;
|
kstat_named_t arcstat_mru_ghost_evictable_data;
|
||||||
/*
|
/*
|
||||||
* Number of bytes that *would have been* consumed by ARC
|
* Number of bytes that *would have been* consumed by ARC
|
||||||
* buffers that are eligible for eviction, of type
|
* buffers that are eligible for eviction, of type
|
||||||
* ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
|
* ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
|
||||||
|
* Not updated directly; only synced in arc_kstat_update.
|
||||||
*/
|
*/
|
||||||
kstat_named_t arcstat_mru_ghost_evictable_metadata;
|
kstat_named_t arcstat_mru_ghost_evictable_metadata;
|
||||||
/*
|
/*
|
||||||
@ -585,36 +601,42 @@ typedef struct arc_stats {
|
|||||||
* arc_mfu state. This includes *all* buffers in the arc_mfu
|
* arc_mfu state. This includes *all* buffers in the arc_mfu
|
||||||
* state; e.g. data, metadata, evictable, and unevictable buffers
|
* state; e.g. data, metadata, evictable, and unevictable buffers
|
||||||
* are all included in this value.
|
* are all included in this value.
|
||||||
|
* Not updated directly; only synced in arc_kstat_update.
|
||||||
*/
|
*/
|
||||||
kstat_named_t arcstat_mfu_size;
|
kstat_named_t arcstat_mfu_size;
|
||||||
/*
|
/*
|
||||||
* Number of bytes consumed by ARC buffers that are eligible for
|
* Number of bytes consumed by ARC buffers that are eligible for
|
||||||
* eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
|
* eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
|
||||||
* state.
|
* state.
|
||||||
|
* Not updated directly; only synced in arc_kstat_update.
|
||||||
*/
|
*/
|
||||||
kstat_named_t arcstat_mfu_evictable_data;
|
kstat_named_t arcstat_mfu_evictable_data;
|
||||||
/*
|
/*
|
||||||
* Number of bytes consumed by ARC buffers that are eligible for
|
* Number of bytes consumed by ARC buffers that are eligible for
|
||||||
* eviction, of type ARC_BUFC_METADATA, and reside in the
|
* eviction, of type ARC_BUFC_METADATA, and reside in the
|
||||||
* arc_mfu state.
|
* arc_mfu state.
|
||||||
|
* Not updated directly; only synced in arc_kstat_update.
|
||||||
*/
|
*/
|
||||||
kstat_named_t arcstat_mfu_evictable_metadata;
|
kstat_named_t arcstat_mfu_evictable_metadata;
|
||||||
/*
|
/*
|
||||||
* Total number of bytes that *would have been* consumed by ARC
|
* Total number of bytes that *would have been* consumed by ARC
|
||||||
* buffers in the arc_mfu_ghost state. See the comment above
|
* buffers in the arc_mfu_ghost state. See the comment above
|
||||||
* arcstat_mru_ghost_size for more details.
|
* arcstat_mru_ghost_size for more details.
|
||||||
|
* Not updated directly; only synced in arc_kstat_update.
|
||||||
*/
|
*/
|
||||||
kstat_named_t arcstat_mfu_ghost_size;
|
kstat_named_t arcstat_mfu_ghost_size;
|
||||||
/*
|
/*
|
||||||
* Number of bytes that *would have been* consumed by ARC
|
* Number of bytes that *would have been* consumed by ARC
|
||||||
* buffers that are eligible for eviction, of type
|
* buffers that are eligible for eviction, of type
|
||||||
* ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
|
* ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
|
||||||
|
* Not updated directly; only synced in arc_kstat_update.
|
||||||
*/
|
*/
|
||||||
kstat_named_t arcstat_mfu_ghost_evictable_data;
|
kstat_named_t arcstat_mfu_ghost_evictable_data;
|
||||||
/*
|
/*
|
||||||
* Number of bytes that *would have been* consumed by ARC
|
* Number of bytes that *would have been* consumed by ARC
|
||||||
* buffers that are eligible for eviction, of type
|
* buffers that are eligible for eviction, of type
|
||||||
* ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
|
* ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
|
||||||
|
* Not updated directly; only synced in arc_kstat_update.
|
||||||
*/
|
*/
|
||||||
kstat_named_t arcstat_mfu_ghost_evictable_metadata;
|
kstat_named_t arcstat_mfu_ghost_evictable_metadata;
|
||||||
kstat_named_t arcstat_l2_hits;
|
kstat_named_t arcstat_l2_hits;
|
||||||
@ -636,8 +658,10 @@ typedef struct arc_stats {
|
|||||||
kstat_named_t arcstat_l2_io_error;
|
kstat_named_t arcstat_l2_io_error;
|
||||||
kstat_named_t arcstat_l2_lsize;
|
kstat_named_t arcstat_l2_lsize;
|
||||||
kstat_named_t arcstat_l2_psize;
|
kstat_named_t arcstat_l2_psize;
|
||||||
|
/* Not updated directly; only synced in arc_kstat_update. */
|
||||||
kstat_named_t arcstat_l2_hdr_size;
|
kstat_named_t arcstat_l2_hdr_size;
|
||||||
kstat_named_t arcstat_memory_throttle_count;
|
kstat_named_t arcstat_memory_throttle_count;
|
||||||
|
/* Not updated directly; only synced in arc_kstat_update. */
|
||||||
kstat_named_t arcstat_meta_used;
|
kstat_named_t arcstat_meta_used;
|
||||||
kstat_named_t arcstat_meta_limit;
|
kstat_named_t arcstat_meta_limit;
|
||||||
kstat_named_t arcstat_meta_max;
|
kstat_named_t arcstat_meta_max;
|
||||||
@ -784,14 +808,12 @@ static arc_state_t *arc_l2c_only;
|
|||||||
* the possibility of inconsistency by having shadow copies of the variables,
|
* the possibility of inconsistency by having shadow copies of the variables,
|
||||||
* while still allowing the code to be readable.
|
* while still allowing the code to be readable.
|
||||||
*/
|
*/
|
||||||
#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
|
|
||||||
#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
|
#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
|
||||||
#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
|
#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
|
||||||
#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
|
#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
|
||||||
#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
|
#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
|
||||||
#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
|
#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
|
||||||
#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
|
#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
|
||||||
#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
|
|
||||||
#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
|
#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
|
||||||
|
|
||||||
/* compressed size of entire arc */
|
/* compressed size of entire arc */
|
||||||
@ -801,6 +823,22 @@ static arc_state_t *arc_l2c_only;
|
|||||||
/* number of bytes in the arc from arc_buf_t's */
|
/* number of bytes in the arc from arc_buf_t's */
|
||||||
#define arc_overhead_size ARCSTAT(arcstat_overhead_size)
|
#define arc_overhead_size ARCSTAT(arcstat_overhead_size)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* There are also some ARC variables that we want to export, but that are
|
||||||
|
* updated so often that having the canonical representation be the statistic
|
||||||
|
* variable causes a performance bottleneck. We want to use aggsum_t's for these
|
||||||
|
* instead, but still be able to export the kstat in the same way as before.
|
||||||
|
* The solution is to always use the aggsum version, except in the kstat update
|
||||||
|
* callback.
|
||||||
|
*/
|
||||||
|
aggsum_t arc_size;
|
||||||
|
aggsum_t arc_meta_used;
|
||||||
|
aggsum_t astat_data_size;
|
||||||
|
aggsum_t astat_metadata_size;
|
||||||
|
aggsum_t astat_hdr_size;
|
||||||
|
aggsum_t astat_other_size;
|
||||||
|
aggsum_t astat_l2_hdr_size;
|
||||||
|
|
||||||
static int arc_no_grow; /* Don't try to grow cache size */
|
static int arc_no_grow; /* Don't try to grow cache size */
|
||||||
static uint64_t arc_tempreserve;
|
static uint64_t arc_tempreserve;
|
||||||
static uint64_t arc_loaned_bytes;
|
static uint64_t arc_loaned_bytes;
|
||||||
@ -1118,21 +1156,15 @@ static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
|
|||||||
static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
|
static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
|
||||||
static void l2arc_read_done(zio_t *);
|
static void l2arc_read_done(zio_t *);
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We use Cityhash for this. It's fast, and has good hash properties without
|
||||||
|
* requiring any large static buffers.
|
||||||
|
*/
|
||||||
static uint64_t
|
static uint64_t
|
||||||
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
|
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
|
||||||
{
|
{
|
||||||
uint8_t *vdva = (uint8_t *)dva;
|
return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth));
|
||||||
uint64_t crc = -1ULL;
|
|
||||||
int i;
|
|
||||||
|
|
||||||
ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
|
|
||||||
|
|
||||||
for (i = 0; i < sizeof (dva_t); i++)
|
|
||||||
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
|
|
||||||
|
|
||||||
crc ^= (spa>>8) ^ birth;
|
|
||||||
|
|
||||||
return (crc);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define HDR_EMPTY(hdr) \
|
#define HDR_EMPTY(hdr) \
|
||||||
@ -2322,26 +2354,26 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
|
|||||||
|
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case ARC_SPACE_DATA:
|
case ARC_SPACE_DATA:
|
||||||
ARCSTAT_INCR(arcstat_data_size, space);
|
aggsum_add(&astat_data_size, space);
|
||||||
break;
|
break;
|
||||||
case ARC_SPACE_META:
|
case ARC_SPACE_META:
|
||||||
ARCSTAT_INCR(arcstat_metadata_size, space);
|
aggsum_add(&astat_metadata_size, space);
|
||||||
break;
|
break;
|
||||||
case ARC_SPACE_OTHER:
|
case ARC_SPACE_OTHER:
|
||||||
ARCSTAT_INCR(arcstat_other_size, space);
|
aggsum_add(&astat_other_size, space);
|
||||||
break;
|
break;
|
||||||
case ARC_SPACE_HDRS:
|
case ARC_SPACE_HDRS:
|
||||||
ARCSTAT_INCR(arcstat_hdr_size, space);
|
aggsum_add(&astat_hdr_size, space);
|
||||||
break;
|
break;
|
||||||
case ARC_SPACE_L2HDRS:
|
case ARC_SPACE_L2HDRS:
|
||||||
ARCSTAT_INCR(arcstat_l2_hdr_size, space);
|
aggsum_add(&astat_l2_hdr_size, space);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (type != ARC_SPACE_DATA)
|
if (type != ARC_SPACE_DATA)
|
||||||
ARCSTAT_INCR(arcstat_meta_used, space);
|
aggsum_add(&arc_meta_used, space);
|
||||||
|
|
||||||
atomic_add_64(&arc_size, space);
|
aggsum_add(&arc_size, space);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
@ -2351,31 +2383,36 @@ arc_space_return(uint64_t space, arc_space_type_t type)
|
|||||||
|
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case ARC_SPACE_DATA:
|
case ARC_SPACE_DATA:
|
||||||
ARCSTAT_INCR(arcstat_data_size, -space);
|
aggsum_add(&astat_data_size, -space);
|
||||||
break;
|
break;
|
||||||
case ARC_SPACE_META:
|
case ARC_SPACE_META:
|
||||||
ARCSTAT_INCR(arcstat_metadata_size, -space);
|
aggsum_add(&astat_metadata_size, -space);
|
||||||
break;
|
break;
|
||||||
case ARC_SPACE_OTHER:
|
case ARC_SPACE_OTHER:
|
||||||
ARCSTAT_INCR(arcstat_other_size, -space);
|
aggsum_add(&astat_other_size, -space);
|
||||||
break;
|
break;
|
||||||
case ARC_SPACE_HDRS:
|
case ARC_SPACE_HDRS:
|
||||||
ARCSTAT_INCR(arcstat_hdr_size, -space);
|
aggsum_add(&astat_hdr_size, -space);
|
||||||
break;
|
break;
|
||||||
case ARC_SPACE_L2HDRS:
|
case ARC_SPACE_L2HDRS:
|
||||||
ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
|
aggsum_add(&astat_l2_hdr_size, -space);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (type != ARC_SPACE_DATA) {
|
if (type != ARC_SPACE_DATA) {
|
||||||
ASSERT(arc_meta_used >= space);
|
ASSERT(aggsum_compare(&arc_meta_used, space) >= 0);
|
||||||
if (arc_meta_max < arc_meta_used)
|
/*
|
||||||
arc_meta_max = arc_meta_used;
|
* We use the upper bound here rather than the precise value
|
||||||
ARCSTAT_INCR(arcstat_meta_used, -space);
|
* because the arc_meta_max value doesn't need to be
|
||||||
|
* precise. It's only consumed by humans via arcstats.
|
||||||
|
*/
|
||||||
|
if (arc_meta_max < aggsum_upper_bound(&arc_meta_used))
|
||||||
|
arc_meta_max = aggsum_upper_bound(&arc_meta_used);
|
||||||
|
aggsum_add(&arc_meta_used, -space);
|
||||||
}
|
}
|
||||||
|
|
||||||
ASSERT(arc_size >= space);
|
ASSERT(aggsum_compare(&arc_size, space) >= 0);
|
||||||
atomic_add_64(&arc_size, -space);
|
aggsum_add(&arc_size, -space);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -3573,7 +3610,7 @@ arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
|
|||||||
* capped by the arc_meta_limit tunable.
|
* capped by the arc_meta_limit tunable.
|
||||||
*/
|
*/
|
||||||
static uint64_t
|
static uint64_t
|
||||||
arc_adjust_meta(void)
|
arc_adjust_meta(uint64_t meta_used)
|
||||||
{
|
{
|
||||||
uint64_t total_evicted = 0;
|
uint64_t total_evicted = 0;
|
||||||
int64_t target;
|
int64_t target;
|
||||||
@ -3585,7 +3622,7 @@ arc_adjust_meta(void)
|
|||||||
* we're over the meta limit more than we're over arc_p, we
|
* we're over the meta limit more than we're over arc_p, we
|
||||||
* evict some from the MRU here, and some from the MFU below.
|
* evict some from the MRU here, and some from the MFU below.
|
||||||
*/
|
*/
|
||||||
target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
|
target = MIN((int64_t)(meta_used - arc_meta_limit),
|
||||||
(int64_t)(refcount_count(&arc_anon->arcs_size) +
|
(int64_t)(refcount_count(&arc_anon->arcs_size) +
|
||||||
refcount_count(&arc_mru->arcs_size) - arc_p));
|
refcount_count(&arc_mru->arcs_size) - arc_p));
|
||||||
|
|
||||||
@ -3596,8 +3633,9 @@ arc_adjust_meta(void)
|
|||||||
* below the meta limit, but not so much as to drop us below the
|
* below the meta limit, but not so much as to drop us below the
|
||||||
* space allotted to the MFU (which is defined as arc_c - arc_p).
|
* space allotted to the MFU (which is defined as arc_c - arc_p).
|
||||||
*/
|
*/
|
||||||
target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
|
target = MIN((int64_t)(meta_used - arc_meta_limit),
|
||||||
(int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
|
(int64_t)(refcount_count(&arc_mfu->arcs_size) -
|
||||||
|
(arc_c - arc_p)));
|
||||||
|
|
||||||
total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
|
total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
|
||||||
|
|
||||||
@ -3688,12 +3726,14 @@ arc_adjust(void)
|
|||||||
uint64_t total_evicted = 0;
|
uint64_t total_evicted = 0;
|
||||||
uint64_t bytes;
|
uint64_t bytes;
|
||||||
int64_t target;
|
int64_t target;
|
||||||
|
uint64_t asize = aggsum_value(&arc_size);
|
||||||
|
uint64_t ameta = aggsum_value(&arc_meta_used);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If we're over arc_meta_limit, we want to correct that before
|
* If we're over arc_meta_limit, we want to correct that before
|
||||||
* potentially evicting data buffers below.
|
* potentially evicting data buffers below.
|
||||||
*/
|
*/
|
||||||
total_evicted += arc_adjust_meta();
|
total_evicted += arc_adjust_meta(ameta);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Adjust MRU size
|
* Adjust MRU size
|
||||||
@ -3705,9 +3745,9 @@ arc_adjust(void)
|
|||||||
* the MRU is over arc_p, we'll evict enough to get back to
|
* the MRU is over arc_p, we'll evict enough to get back to
|
||||||
* arc_p here, and then evict more from the MFU below.
|
* arc_p here, and then evict more from the MFU below.
|
||||||
*/
|
*/
|
||||||
target = MIN((int64_t)(arc_size - arc_c),
|
target = MIN((int64_t)(asize - arc_c),
|
||||||
(int64_t)(refcount_count(&arc_anon->arcs_size) +
|
(int64_t)(refcount_count(&arc_anon->arcs_size) +
|
||||||
refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));
|
refcount_count(&arc_mru->arcs_size) + ameta - arc_p));
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If we're below arc_meta_min, always prefer to evict data.
|
* If we're below arc_meta_min, always prefer to evict data.
|
||||||
@ -3718,7 +3758,7 @@ arc_adjust(void)
|
|||||||
* type, spill over into the next type.
|
* type, spill over into the next type.
|
||||||
*/
|
*/
|
||||||
if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
|
if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
|
||||||
arc_meta_used > arc_meta_min) {
|
ameta > arc_meta_min) {
|
||||||
bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
|
bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
|
||||||
total_evicted += bytes;
|
total_evicted += bytes;
|
||||||
|
|
||||||
@ -3751,10 +3791,10 @@ arc_adjust(void)
|
|||||||
* size back to arc_p, if we're still above the target cache
|
* size back to arc_p, if we're still above the target cache
|
||||||
* size, we evict the rest from the MFU.
|
* size, we evict the rest from the MFU.
|
||||||
*/
|
*/
|
||||||
target = arc_size - arc_c;
|
target = asize - arc_c;
|
||||||
|
|
||||||
if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
|
if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
|
||||||
arc_meta_used > arc_meta_min) {
|
ameta > arc_meta_min) {
|
||||||
bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
|
bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
|
||||||
total_evicted += bytes;
|
total_evicted += bytes;
|
||||||
|
|
||||||
@ -3855,6 +3895,7 @@ arc_flush(spa_t *spa, boolean_t retry)
|
|||||||
void
|
void
|
||||||
arc_shrink(int64_t to_free)
|
arc_shrink(int64_t to_free)
|
||||||
{
|
{
|
||||||
|
uint64_t asize = aggsum_value(&arc_size);
|
||||||
if (arc_c > arc_c_min) {
|
if (arc_c > arc_c_min) {
|
||||||
|
|
||||||
if (arc_c > arc_c_min + to_free)
|
if (arc_c > arc_c_min + to_free)
|
||||||
@ -3863,15 +3904,15 @@ arc_shrink(int64_t to_free)
|
|||||||
arc_c = arc_c_min;
|
arc_c = arc_c_min;
|
||||||
|
|
||||||
atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
|
atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
|
||||||
if (arc_c > arc_size)
|
if (asize < arc_c)
|
||||||
arc_c = MAX(arc_size, arc_c_min);
|
arc_c = MAX(asize, arc_c_min);
|
||||||
if (arc_p > arc_c)
|
if (arc_p > arc_c)
|
||||||
arc_p = (arc_c >> 1);
|
arc_p = (arc_c >> 1);
|
||||||
ASSERT(arc_c >= arc_c_min);
|
ASSERT(arc_c >= arc_c_min);
|
||||||
ASSERT((int64_t)arc_p >= 0);
|
ASSERT((int64_t)arc_p >= 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (arc_size > arc_c)
|
if (asize > arc_c)
|
||||||
(void) arc_adjust();
|
(void) arc_adjust();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -4035,7 +4076,7 @@ arc_kmem_reap_now(void)
|
|||||||
extern kmem_cache_t *abd_chunk_cache;
|
extern kmem_cache_t *abd_chunk_cache;
|
||||||
|
|
||||||
#ifdef _KERNEL
|
#ifdef _KERNEL
|
||||||
if (arc_meta_used >= arc_meta_limit) {
|
if (aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) {
|
||||||
/*
|
/*
|
||||||
* We are exceeding our meta-data cache limit.
|
* We are exceeding our meta-data cache limit.
|
||||||
* Purge some DNLC entries to release holds on meta-data.
|
* Purge some DNLC entries to release holds on meta-data.
|
||||||
@ -4194,7 +4235,7 @@ arc_reclaim_thread(void *unused)
|
|||||||
* be helpful and could potentially cause us to enter an
|
* be helpful and could potentially cause us to enter an
|
||||||
* infinite loop.
|
* infinite loop.
|
||||||
*/
|
*/
|
||||||
if (arc_size <= arc_c || evicted == 0) {
|
if (aggsum_compare(&arc_size, arc_c) <= 0|| evicted == 0) {
|
||||||
/*
|
/*
|
||||||
* We're either no longer overflowing, or we
|
* We're either no longer overflowing, or we
|
||||||
* can't evict anything more, so we should wake
|
* can't evict anything more, so we should wake
|
||||||
@ -4276,7 +4317,8 @@ arc_adapt(int bytes, arc_state_t *state)
|
|||||||
* If we're within (2 * maxblocksize) bytes of the target
|
* If we're within (2 * maxblocksize) bytes of the target
|
||||||
* cache size, increment the target cache size
|
* cache size, increment the target cache size
|
||||||
*/
|
*/
|
||||||
if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
|
if (aggsum_compare(&arc_size, arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) >
|
||||||
|
0) {
|
||||||
atomic_add_64(&arc_c, (int64_t)bytes);
|
atomic_add_64(&arc_c, (int64_t)bytes);
|
||||||
if (arc_c > arc_c_max)
|
if (arc_c > arc_c_max)
|
||||||
arc_c = arc_c_max;
|
arc_c = arc_c_max;
|
||||||
@ -4299,7 +4341,16 @@ arc_is_overflowing(void)
|
|||||||
uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
|
uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
|
||||||
arc_c >> zfs_arc_overflow_shift);
|
arc_c >> zfs_arc_overflow_shift);
|
||||||
|
|
||||||
return (arc_size >= arc_c + overflow);
|
/*
|
||||||
|
* We just compare the lower bound here for performance reasons. Our
|
||||||
|
* primary goals are to make sure that the arc never grows without
|
||||||
|
* bound, and that it can reach its maximum size. This check
|
||||||
|
* accomplishes both goals. The maximum amount we could run over by is
|
||||||
|
* 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block
|
||||||
|
* in the ARC. In practice, that's in the tens of MB, which is low
|
||||||
|
* enough to be safe.
|
||||||
|
*/
|
||||||
|
return (aggsum_lower_bound(&arc_size) >= arc_c + overflow);
|
||||||
}
|
}
|
||||||
|
|
||||||
static abd_t *
|
static abd_t *
|
||||||
@ -4414,7 +4465,8 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
|
|||||||
* If we are growing the cache, and we are adding anonymous
|
* If we are growing the cache, and we are adding anonymous
|
||||||
* data, and we have outgrown arc_p, update arc_p
|
* data, and we have outgrown arc_p, update arc_p
|
||||||
*/
|
*/
|
||||||
if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
|
if (aggsum_compare(&arc_size, arc_c) < 0 &&
|
||||||
|
hdr->b_l1hdr.b_state == arc_anon &&
|
||||||
(refcount_count(&arc_anon->arcs_size) +
|
(refcount_count(&arc_anon->arcs_size) +
|
||||||
refcount_count(&arc_mru->arcs_size) > arc_p))
|
refcount_count(&arc_mru->arcs_size) > arc_p))
|
||||||
arc_p = MIN(arc_c, arc_p + size);
|
arc_p = MIN(arc_c, arc_p + size);
|
||||||
@ -5850,6 +5902,15 @@ arc_kstat_update(kstat_t *ksp, int rw)
|
|||||||
&as->arcstat_mfu_ghost_size,
|
&as->arcstat_mfu_ghost_size,
|
||||||
&as->arcstat_mfu_ghost_evictable_data,
|
&as->arcstat_mfu_ghost_evictable_data,
|
||||||
&as->arcstat_mfu_ghost_evictable_metadata);
|
&as->arcstat_mfu_ghost_evictable_metadata);
|
||||||
|
|
||||||
|
ARCSTAT(arcstat_size) = aggsum_value(&arc_size);
|
||||||
|
ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used);
|
||||||
|
ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size);
|
||||||
|
ARCSTAT(arcstat_metadata_size) =
|
||||||
|
aggsum_value(&astat_metadata_size);
|
||||||
|
ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size);
|
||||||
|
ARCSTAT(arcstat_other_size) = aggsum_value(&astat_other_size);
|
||||||
|
ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
return (0);
|
return (0);
|
||||||
@ -5960,6 +6021,14 @@ arc_state_init(void)
|
|||||||
refcount_create(&arc_mfu->arcs_size);
|
refcount_create(&arc_mfu->arcs_size);
|
||||||
refcount_create(&arc_mfu_ghost->arcs_size);
|
refcount_create(&arc_mfu_ghost->arcs_size);
|
||||||
refcount_create(&arc_l2c_only->arcs_size);
|
refcount_create(&arc_l2c_only->arcs_size);
|
||||||
|
|
||||||
|
aggsum_init(&arc_meta_used, 0);
|
||||||
|
aggsum_init(&arc_size, 0);
|
||||||
|
aggsum_init(&astat_data_size, 0);
|
||||||
|
aggsum_init(&astat_metadata_size, 0);
|
||||||
|
aggsum_init(&astat_hdr_size, 0);
|
||||||
|
aggsum_init(&astat_other_size, 0);
|
||||||
|
aggsum_init(&astat_l2_hdr_size, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
@ -6052,7 +6121,6 @@ arc_init(void)
|
|||||||
|
|
||||||
arc_c = arc_c_max;
|
arc_c = arc_c_max;
|
||||||
arc_p = (arc_c >> 1);
|
arc_p = (arc_c >> 1);
|
||||||
arc_size = 0;
|
|
||||||
|
|
||||||
/* limit meta-data to 1/4 of the arc capacity */
|
/* limit meta-data to 1/4 of the arc capacity */
|
||||||
arc_meta_limit = arc_c_max / 4;
|
arc_meta_limit = arc_c_max / 4;
|
||||||
|
63
uts/common/fs/zfs/cityhash.c
Normal file
63
uts/common/fs/zfs/cityhash.c
Normal file
@ -0,0 +1,63 @@
|
|||||||
|
// Copyright (c) 2011 Google, Inc.
|
||||||
|
//
|
||||||
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
// of this software and associated documentation files (the "Software"), to deal
|
||||||
|
// in the Software without restriction, including without limitation the rights
|
||||||
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
// copies of the Software, and to permit persons to whom the Software is
|
||||||
|
// furnished to do so, subject to the following conditions:
|
||||||
|
//
|
||||||
|
// The above copyright notice and this permission notice shall be included in
|
||||||
|
// all copies or substantial portions of the Software.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
// THE SOFTWARE.
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2017 by Delphix. All rights reserved.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <sys/cityhash.h>
|
||||||
|
|
||||||
|
#define HASH_K1 0xb492b66fbe98f273ULL
|
||||||
|
#define HASH_K2 0x9ae16a3b2f90404fULL
|
||||||
|
|
||||||
|
/*
 * Bitwise right rotate of a 64-bit value.
 *
 * val:   the value to rotate.
 * shift: the rotate distance in bits. It is reduced modulo 64, so 0, 64,
 *        larger values and negative values are all well defined (a
 *        negative distance rotates left by the corresponding amount).
 *
 * Returns val rotated right by (shift mod 64) bits.
 *
 * Both shift counts below are masked into the range [0, 63], so the
 * expression never shifts by the full width of the type, which would be
 * undefined.  Compilers typically recognize this form and emit a single
 * rotate instruction.
 */
static inline uint64_t
rotate(uint64_t val, int shift)
{
	/* int -> unsigned conversion is modulo 2^N, which preserves mod 64. */
	unsigned int s = (unsigned int)shift & 63u;

	return ((val >> s) | (val << ((64u - s) & 63u)));
}
|
||||||
|
|
||||||
|
/*
 * Mixing step used by cityhash4().
 *
 * Combines u and v in two rounds; each round XORs the inputs, multiplies
 * by mul and folds the top bits down with a 47-bit right shift.  The
 * second round's result is multiplied by mul once more and returned.
 * All arithmetic is unsigned 64-bit and wraps on overflow.
 */
static inline uint64_t
cityhash_helper(uint64_t u, uint64_t v, uint64_t mul)
{
	uint64_t lo_mix, hi_mix;

	/* Round one: mix u with v. */
	lo_mix = (u ^ v) * mul;
	lo_mix = lo_mix ^ (lo_mix >> 47);

	/* Round two: mix v with the round-one result. */
	hi_mix = (v ^ lo_mix) * mul;
	hi_mix = hi_mix ^ (hi_mix >> 47);

	return (hi_mix * mul);
}
|
||||||
|
|
||||||
|
uint64_t
|
||||||
|
cityhash4(uint64_t w1, uint64_t w2, uint64_t w3, uint64_t w4)
|
||||||
|
{
|
||||||
|
uint64_t mul = HASH_K2 + 64;
|
||||||
|
uint64_t a = w1 * HASH_K1;
|
||||||
|
uint64_t b = w2;
|
||||||
|
uint64_t c = w4 * mul;
|
||||||
|
uint64_t d = w3 * HASH_K2;
|
||||||
|
return (cityhash_helper(rotate(a + b, 43) + rotate(c, 30) + d,
|
||||||
|
a + rotate(b + HASH_K2, 18) + c, mul));
|
||||||
|
|
||||||
|
}
|
@ -48,6 +48,7 @@
|
|||||||
#include <sys/callb.h>
|
#include <sys/callb.h>
|
||||||
#include <sys/abd.h>
|
#include <sys/abd.h>
|
||||||
#include <sys/vdev.h>
|
#include <sys/vdev.h>
|
||||||
|
#include <sys/cityhash.h>
|
||||||
|
|
||||||
uint_t zfs_dbuf_evict_key;
|
uint_t zfs_dbuf_evict_key;
|
||||||
|
|
||||||
@ -167,23 +168,14 @@ static dbuf_hash_table_t dbuf_hash_table;
|
|||||||
|
|
||||||
static uint64_t dbuf_hash_count;
|
static uint64_t dbuf_hash_count;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We use Cityhash for this. It's fast, and has good hash properties without
|
||||||
|
* requiring any large static buffers.
|
||||||
|
*/
|
||||||
static uint64_t
|
static uint64_t
|
||||||
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
|
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
|
||||||
{
|
{
|
||||||
uintptr_t osv = (uintptr_t)os;
|
return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid));
|
||||||
uint64_t crc = -1ULL;
|
|
||||||
|
|
||||||
ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
|
|
||||||
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
|
|
||||||
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
|
|
||||||
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
|
|
||||||
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
|
|
||||||
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
|
|
||||||
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
|
|
||||||
|
|
||||||
crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
|
|
||||||
|
|
||||||
return (crc);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
|
#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
|
||||||
|
60
uts/common/fs/zfs/sys/aggsum.h
Normal file
60
uts/common/fs/zfs/sys/aggsum.h
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
/*
|
||||||
|
* CDDL HEADER START
|
||||||
|
*
|
||||||
|
* This file and its contents are supplied under the terms of the
|
||||||
|
* Common Development and Distribution License ("CDDL"), version 1.0.
|
||||||
|
* You may only use this file in accordance with the terms of version
|
||||||
|
* 1.0 of the CDDL.
|
||||||
|
*
|
||||||
|
* A full copy of the text of the CDDL should have accompanied this
|
||||||
|
* source. A copy of the CDDL is also available via the Internet at
|
||||||
|
* http://www.illumos.org/license/CDDL.
|
||||||
|
*
|
||||||
|
* CDDL HEADER END
|
||||||
|
*/
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2017 by Delphix. All rights reserved.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef _SYS_AGGSUM_H
|
||||||
|
#define _SYS_AGGSUM_H
|
||||||
|
|
||||||
|
#include <sys/zfs_context.h>
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define CACHE_LINE_SIZE 64
|
||||||
|
|
||||||
|
/*
 * One bucket of an aggsum_t (see the as_buckets member of struct aggsum).
 * The typedef requests CACHE_LINE_SIZE alignment for each bucket.
 * NOTE(review): the per-field notes below are inferred from names and
 * types only; confirm against aggsum.c.
 */
typedef struct aggsum_bucket {
|
||||||
|
kmutex_t asc_lock; /* presumably guards the fields below -- TODO confirm */
|
||||||
|
int64_t asc_delta; /* signed; presumably this bucket's pending change -- TODO confirm */
|
||||||
|
uint64_t asc_borrowed; /* unsigned; meaning not visible here -- TODO confirm */
|
||||||
|
uint64_t asc_pad[4]; /* pad out to cache line (64 bytes) */
|
||||||
|
} aggsum_bucket_t __aligned(CACHE_LINE_SIZE);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Fan out over the as_numbuckets buckets pointed to by as_buckets.
|
||||||
|
*/
|
||||||
|
typedef struct aggsum {
|
||||||
|
kmutex_t as_lock; /* presumably guards the bounds below -- TODO confirm */
|
||||||
|
int64_t as_lower_bound; /* presumably what aggsum_lower_bound() reports -- TODO confirm */
|
||||||
|
int64_t as_upper_bound; /* presumably what aggsum_upper_bound() reports -- TODO confirm */
|
||||||
|
uint64_t as_numbuckets; /* presumably the number of entries in as_buckets -- TODO confirm */
|
||||||
|
aggsum_bucket_t *as_buckets; /* the buckets; allocation not visible here */
|
||||||
|
} aggsum_t;
|
||||||
|
|
||||||
|
void aggsum_init(aggsum_t *, uint64_t);
|
||||||
|
void aggsum_fini(aggsum_t *);
|
||||||
|
int64_t aggsum_lower_bound(aggsum_t *);
|
||||||
|
int64_t aggsum_upper_bound(aggsum_t *);
|
||||||
|
int aggsum_compare(aggsum_t *, uint64_t);
|
||||||
|
uint64_t aggsum_value(aggsum_t *);
|
||||||
|
void aggsum_add(aggsum_t *, int64_t);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* _SYS_AGGSUM_H */
|
41
uts/common/fs/zfs/sys/cityhash.h
Normal file
41
uts/common/fs/zfs/sys/cityhash.h
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
// Copyright (c) 2011 Google, Inc.
|
||||||
|
//
|
||||||
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
// of this software and associated documentation files (the "Software"), to deal
|
||||||
|
// in the Software without restriction, including without limitation the rights
|
||||||
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
// copies of the Software, and to permit persons to whom the Software is
|
||||||
|
// furnished to do so, subject to the following conditions:
|
||||||
|
//
|
||||||
|
// The above copyright notice and this permission notice shall be included in
|
||||||
|
// all copies or substantial portions of the Software.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
// THE SOFTWARE.
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2017 by Delphix. All rights reserved.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef _SYS_CITYHASH_H
|
||||||
|
#define _SYS_CITYHASH_H
|
||||||
|
|
||||||
|
#include <sys/zfs_context.h>
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
uint64_t cityhash4(uint64_t, uint64_t, uint64_t, uint64_t);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* _SYS_CITYHASH_H */
|
Loading…
Reference in New Issue
Block a user