4976 zfs should only avoid writing to a failing non-redundant top-level vdev
4977 mdb error in ::spa_space from space_cb() if a metaslab's ms_sm is NULL
4978 ztest fails in get_metaslab_refcount()
4979 extend free space histogram to device and pool
4980 metaslabs should have a fragmentation metric
4981 remove fragmented ops vector from block allocator
4982 space_map object should proactively upgrade when feature is enabled
4983 need to collect metaslab information via mdb
4984 device selection should use fragmentation metric

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <adam.leventhal@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>

illumos/illumos-gate@2e4c998613
commit d6fb141e08
parent 4fa00fc6d6

Notes (svn2git, 2020-12-20 02:59:44 +00:00):
svn path=/vendor-sys/illumos/dist/; revision=269010
@@ -111,11 +111,11 @@ static void
usage(void)
{
(void) fprintf(stderr,
"Usage: %s [-CumdibcsDvhLXFPA] [-t txg] [-e [-p path...]] "
"[-U config] [-M inflight I/Os] [-x dumpdir] poolname [object...]\n"
"Usage: %s [-CumMdibcsDvhLXFPA] [-t txg] [-e [-p path...]] "
"[-U config] [-I inflight I/Os] [-x dumpdir] poolname [object...]\n"
" %s [-divPA] [-e -p path...] [-U config] dataset "
"[object...]\n"
" %s -m [-LXFPA] [-t txg] [-e [-p path...]] [-U config] "
" %s -mM [-LXFPA] [-t txg] [-e [-p path...]] [-U config] "
"poolname [vdev [metaslab...]]\n"
" %s -R [-A] [-e [-p path...]] poolname "
"vdev:offset:size[:flags]\n"
@@ -138,6 +138,7 @@ usage(void)
(void) fprintf(stderr, " -h pool history\n");
(void) fprintf(stderr, " -b block statistics\n");
(void) fprintf(stderr, " -m metaslabs\n");
(void) fprintf(stderr, " -M metaslab groups\n");
(void) fprintf(stderr, " -c checksum all metadata (twice for "
"all data) blocks\n");
(void) fprintf(stderr, " -s report stats on zdb's I/O\n");
@@ -168,7 +169,7 @@ usage(void)
(void) fprintf(stderr, " -P print numbers in parseable form\n");
(void) fprintf(stderr, " -t <txg> -- highest txg to use when "
"searching for uberblocks\n");
(void) fprintf(stderr, " -M <number of inflight I/Os> -- "
(void) fprintf(stderr, " -I <number of inflight I/Os> -- "
"specify the maximum number of "
"checksumming I/Os [default is 200]\n");
(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
@@ -548,7 +549,7 @@ get_metaslab_refcount(vdev_t *vd)
{
int refcount = 0;

if (vd->vdev_top == vd) {
if (vd->vdev_top == vd && !vd->vdev_removing) {
for (int m = 0; m < vd->vdev_ms_count; m++) {
space_map_t *sm = vd->vdev_ms[m]->ms_sm;

@@ -686,9 +687,10 @@ dump_metaslab(metaslab_t *msp)
* The space map histogram represents free space in chunks
* of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
*/
(void) printf("\tOn-disk histogram:\n");
(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
(u_longlong_t)msp->ms_fragmentation);
dump_histogram(sm->sm_phys->smp_histogram,
SPACE_MAP_HISTOGRAM_SIZE(sm), sm->sm_shift);
SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
}

if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
@@ -712,6 +714,47 @@ print_vdev_metaslab_header(vdev_t *vd)
"---------------", "-------------");
}

static void
dump_metaslab_groups(spa_t *spa)
{
vdev_t *rvd = spa->spa_root_vdev;
metaslab_class_t *mc = spa_normal_class(spa);
uint64_t fragmentation;

metaslab_class_histogram_verify(mc);

for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;

if (mg->mg_class != mc)
continue;

metaslab_group_histogram_verify(mg);
mg->mg_fragmentation = metaslab_group_fragmentation(mg);

(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
"fragmentation",
(u_longlong_t)tvd->vdev_id,
(u_longlong_t)tvd->vdev_ms_count);
if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
(void) printf("%3s\n", "-");
} else {
(void) printf("%3llu%%\n",
(u_longlong_t)mg->mg_fragmentation);
}
dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
}

(void) printf("\tpool %s\tfragmentation", spa_name(spa));
fragmentation = metaslab_class_fragmentation(mc);
if (fragmentation == ZFS_FRAG_INVALID)
(void) printf("\t%3s\n", "-");
else
(void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation);
dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
}

static void
dump_metaslabs(spa_t *spa)
{
@@ -2340,8 +2383,7 @@ zdb_leak(void *arg, uint64_t start, uint64_t size)
}

static metaslab_ops_t zdb_metaslab_ops = {
NULL, /* alloc */
NULL /* fragmented */
NULL /* alloc */
};

static void
@@ -2836,6 +2878,8 @@ dump_zpool(spa_t *spa)

if (dump_opt['d'] > 2 || dump_opt['m'])
dump_metaslabs(spa);
if (dump_opt['M'])
dump_metaslab_groups(spa);

if (dump_opt['d'] || dump_opt['i']) {
dump_dir(dp->dp_meta_objset);
@@ -3330,7 +3374,7 @@ main(int argc, char **argv)
dprintf_setup(&argc, argv);

while ((c = getopt(argc, argv,
"bcdhilmM:suCDRSAFLXx:evp:t:U:P")) != -1) {
"bcdhilmMI:suCDRSAFLXx:evp:t:U:P")) != -1) {
switch (c) {
case 'b':
case 'c':
@@ -3343,6 +3387,7 @@ main(int argc, char **argv)
case 'u':
case 'C':
case 'D':
case 'M':
case 'R':
case 'S':
dump_opt[c]++;
@@ -3356,10 +3401,7 @@ main(int argc, char **argv)
case 'P':
dump_opt[c]++;
break;
case 'v':
verbose++;
break;
case 'M':
case 'I':
max_inflight = strtoull(optarg, NULL, 0);
if (max_inflight == 0) {
(void) fprintf(stderr, "maximum number "
@@ -3383,9 +3425,6 @@ main(int argc, char **argv)
}
searchdirs[nsearch++] = optarg;
break;
case 'x':
vn_dumpdir = optarg;
break;
case 't':
max_txg = strtoull(optarg, NULL, 0);
if (max_txg < TXG_INITIAL) {
@@ -3397,6 +3436,12 @@ main(int argc, char **argv)
case 'U':
spa_config_path = optarg;
break;
case 'v':
verbose++;
break;
case 'x':
vn_dumpdir = optarg;
break;
default:
usage();
break;

@@ -2754,10 +2754,15 @@ print_one_column(zpool_prop_t prop, uint64_t value, boolean_t scripted)
boolean_t fixed;
size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL);

zfs_nicenum(value, propval, sizeof (propval));

if (prop == ZPOOL_PROP_EXPANDSZ && value == 0)
(void) strlcpy(propval, "-", sizeof (propval));
else if (prop == ZPOOL_PROP_FRAGMENTATION && value == ZFS_FRAG_INVALID)
(void) strlcpy(propval, "-", sizeof (propval));
else if (prop == ZPOOL_PROP_FRAGMENTATION)
(void) snprintf(propval, sizeof (propval), "%llu%%", value);
else
zfs_nicenum(value, propval, sizeof (propval));

if (scripted)
(void) printf("\t%s", propval);
@@ -2790,9 +2795,9 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
/* only toplevel vdevs have capacity stats */
if (vs->vs_space == 0) {
if (scripted)
(void) printf("\t-\t-\t-");
(void) printf("\t-\t-\t-\t-");
else
(void) printf(" - - -");
(void) printf(" - - - -");
} else {
print_one_column(ZPOOL_PROP_SIZE, vs->vs_space,
scripted);
@@ -2800,6 +2805,8 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
scripted);
print_one_column(ZPOOL_PROP_FREE,
vs->vs_space - vs->vs_alloc, scripted);
print_one_column(ZPOOL_PROP_FRAGMENTATION,
vs->vs_fragmentation, scripted);
}
print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize,
scripted);
@@ -2885,8 +2892,8 @@ zpool_do_list(int argc, char **argv)
int ret;
list_cbdata_t cb = { 0 };
static char default_props[] =
"name,size,allocated,free,expandsize,capacity,dedupratio,"
"health,altroot";
"name,size,allocated,free,fragmentation,expandsize,capacity,"
"dedupratio,health,altroot";
char *props = default_props;
unsigned long interval = 0, count = 0;
zpool_list_t *list;
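Both the zdb and zpool hunks above print "-" when a value is unavailable. A minimal standalone sketch of the fragmentation-column formatting, assuming ZFS_FRAG_INVALID is UINT64_MAX (as the libzfs hunk further down implies) and using the BSD/illumos strlcpy():

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	ZFS_FRAG_INVALID	UINT64_MAX	/* assumption; see libzfs hunk below */

/* Format a fragmentation value the way print_one_column() above does. */
static void
format_frag(uint64_t value, char *buf, size_t len)
{
	if (value == ZFS_FRAG_INVALID)
		(void) strlcpy(buf, "-", len);
	else
		(void) snprintf(buf, len, "%llu%%",
		    (unsigned long long)value);
}

int
main(void)
{
	char buf[16];

	format_frag(48, buf, sizeof (buf));
	(void) printf("%s\n", buf);	/* prints "48%" */
	format_frag(ZFS_FRAG_INVALID, buf, sizeof (buf));
	(void) printf("%s\n", buf);	/* prints "-" */
	return (0);
}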
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/

#include <sys/zio.h>
@@ -87,6 +87,8 @@ zpool_prop_init(void)
PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC");
zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0,
PROP_READONLY, ZFS_TYPE_POOL, "<size>", "EXPANDSZ");
zprop_register_number(ZPOOL_PROP_FRAGMENTATION, "fragmentation", 0,
PROP_READONLY, ZFS_TYPE_POOL, "<percent>", "FRAG");
zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY,
ZFS_TYPE_POOL, "<size>", "CAP");
zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY,
@@ -294,6 +294,14 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len,
(u_longlong_t)intval);
}
break;
case ZPOOL_PROP_FRAGMENTATION:
if (intval == UINT64_MAX) {
(void) strlcpy(buf, "-", len);
} else {
(void) snprintf(buf, len, "%llu%%",
(u_longlong_t)intval);
}
break;

case ZPOOL_PROP_DEDUPRATIO:
(void) snprintf(buf, len, "%llu.%02llux",
@@ -19,8 +19,8 @@
\fBzdb\fR - Display zpool debugging and consistency information

.SH "SYNOPSIS"
\fBzdb\fR [-CumdibcsDvhLXFPA] [-e [-p \fIpath\fR...]] [-t \fItxg\fR]
[-U \fIcache\fR] [-M \fIinflight I/Os\fR] [-x \fIdumpdir\fR]
\fBzdb\fR [-CumdibcsDvhLMXFPA] [-e [-p \fIpath\fR...]] [-t \fItxg\fR]
[-U \fIcache\fR] [-I \fIinflight I/Os\fR] [-x \fIdumpdir\fR]
[\fIpoolname\fR [\fIobject\fR ...]]

.P
@@ -28,7 +28,7 @@
\fIdataset\fR [\fIobject\fR ...]

.P
\fBzdb\fR -m [-LXFPA] [-t \fItxg\fR] [-e [-p \fIpath\fR...]] [-U \fIcache\fR]
\fBzdb\fR -m [-MLXFPA] [-t \fItxg\fR] [-e [-p \fIpath\fR...]] [-U \fIcache\fR]
\fIpoolname\fR [\fIvdev\fR [\fImetaslab\fR ...]]

.P
@@ -194,6 +194,21 @@ verifies that all non-free blocks are referenced, which can be very expensive.
.sp .6
.RS 4n
Display the offset, spacemap, and free space of each metaslab.
When specified twice, also display information about the on-disk free
space histogram associated with each metaslab. When specified three times,
display the maximum contiguous free space, the in-core free space histogram,
and the percentage of free space in each space map. When specified
four times display every spacemap record.
.RE

.sp
.ne 2
.na
\fB-M\fR
.ad
.sp .6
.RS 4n
Display the offset, spacemap, and free space of each metaslab.
When specified twice, also display information about the maximum contiguous
free space and the percentage of free space in each space map. When specified
three times display every spacemap record.
@@ -380,7 +395,7 @@ transactions.
.sp
.ne 2
.na
\fB-M \fIinflight I/Os\fR \fR
\fB-I \fIinflight I/Os\fR \fR
.ad
.sp .6
.RS 4n
@@ -1,7 +1,7 @@
'\" te
.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
.\" Copyright 2011, Nexenta Systems, Inc. All Rights Reserved.
.\" Copyright (c) 2012 by Delphix. All rights reserved.
.\" Copyright (c) 2013 by Delphix. All rights reserved.
.\" The contents of this file are subject to the terms of the Common Development
.\" and Distribution License (the "License"). You may not use this file except
.\" in compliance with the License. You can obtain a copy of the license at
@@ -570,6 +570,15 @@ any space on an EFI labeled vdev which has not been brought online
(i.e. zpool online -e). This space occurs when a LUN is dynamically expanded.
.RE

.sp
.ne 2
.na
\fB\fBfragmentation\fR\fR
.ad
.RS 20n
The amount of fragmentation in the pool.
.RE

.sp
.ne 2
.na
@@ -1648,7 +1657,7 @@ Display numbers in parsable (exact) values.
.RS 12n
Comma-separated list of properties to display. See the "Properties" section for
a list of valid properties. The default list is "name, size, used, available,
expandsize, capacity, dedupratio, health, altroot"
fragmentation, expandsize, capacity, dedupratio, health, altroot"
.RE

.sp
@@ -2035,10 +2044,10 @@ The results from this command are similar to the following:
.in +2
.nf
# \fBzpool list\fR
NAME SIZE ALLOC FREE EXPANDSZ CAP DEDUP HEALTH ALTROOT
rpool 19.9G 8.43G 11.4G - 42% 1.00x ONLINE -
tank 61.5G 20.0G 41.5G - 32% 1.00x ONLINE -
zion - - - - - - FAULTED -
NAME SIZE ALLOC FREE FRAG EXPANDSZ CAP DEDUP HEALTH ALTROOT
rpool 19.9G 8.43G 11.4G 33% - 42% 1.00x ONLINE -
tank 61.5G 20.0G 41.5G 48% - 32% 1.00x ONLINE -
zion - - - - - - - FAULTED -
.fi
.in -2
.sp
@@ -2259,7 +2268,7 @@ The command to remove the mirrored log \fBmirror-2\fR is:
.LP
The following command displays the detailed information for the \fIdata\fR
pool. This pool is comprised of a single \fIraidz\fR vdev where one of its
devices increased its capacity by 1GB. In this example, the pool will not
devices increased its capacity by 10GB. In this example, the pool will not
be able to utilize this extra capacity until all the devices under the
\fIraidz\fR vdev have been expanded.

@@ -2267,12 +2276,12 @@ be able to utilize this extra capacity until all the devices under the
.in +2
.nf
# \fBzpool list -v data\fR
NAME SIZE ALLOC FREE EXPANDSZ CAP DEDUP HEALTH ALTROOT
data 17.9G 174K 17.9G - 0% 1.00x ONLINE -
raidz1 17.9G 174K 17.9G -
c4t2d0 - - - 1G
c4t3d0 - - - -
c4t4d0 - - - -
NAME SIZE ALLOC FREE FRAG EXPANDSZ CAP DEDUP HEALTH ALTROOT
data 23.9G 14.6G 9.30G 48% - 61% 1.00x ONLINE -
raidz1 23.9G 14.6G 9.30G 48% -
c1t1d0 - - - - -
c1t2d0 - - - - 10G
c1t3d0 - - - - -
.fi
.in -2

@@ -32,6 +32,7 @@
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zfeature.h>

/*
* Allow allocations to switch to gang blocks quickly. We do this to
@@ -79,7 +80,7 @@ int zfs_metaslab_condense_block_threshold = 4;
/*
* The zfs_mg_noalloc_threshold defines which metaslab groups should
* be eligible for allocation. The value is defined as a percentage of
* a free space. Metaslab groups that have more free space than
* free space. Metaslab groups that have more free space than
* zfs_mg_noalloc_threshold are always eligible for allocations. Once
* a metaslab group's free space is less than or equal to the
* zfs_mg_noalloc_threshold the allocator will avoid allocating to that
@@ -91,6 +92,23 @@ int zfs_metaslab_condense_block_threshold = 4;
*/
int zfs_mg_noalloc_threshold = 0;

/*
* Metaslab groups are considered eligible for allocations if their
* fragmentation metric (measured as a percentage) is less than or equal to
* zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
* then it will be skipped unless all metaslab groups within the metaslab
* class have also crossed this threshold.
*/
int zfs_mg_fragmentation_threshold = 85;

/*
* Allow metaslabs to keep their active state as long as their fragmentation
* percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
* active metaslab that exceeds this threshold will no longer keep its active
* status allowing better metaslabs to be selected.
*/
int zfs_metaslab_fragmentation_threshold = 70;

/*
* When set will load all metaslabs when pool is first opened.
*/
@@ -135,11 +153,6 @@ int metaslab_load_pct = 50;
*/
int metaslab_unload_delay = TXG_SIZE * 2;

/*
* Should we be willing to write data to degraded vdevs?
*/
boolean_t zfs_write_to_degraded = B_FALSE;

/*
* Max number of metaslabs per group to preload.
*/
@@ -151,10 +164,21 @@ int metaslab_preload_limit = SPA_DVAS_PER_BP;
boolean_t metaslab_preload_enabled = B_TRUE;

/*
* Enable/disable additional weight factor for each metaslab.
* Enable/disable fragmentation weighting on metaslabs.
*/
boolean_t metaslab_weight_factor_enable = B_FALSE;
boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;

/*
* Enable/disable lba weighting (i.e. outer tracks are given preference).
*/
boolean_t metaslab_lba_weighting_enabled = B_TRUE;

/*
* Enable/disable metaslab group biasing.
*/
boolean_t metaslab_bias_enabled = B_TRUE;

static uint64_t metaslab_fragmentation(metaslab_t *);

/*
* ==========================================================================
@@ -247,6 +271,121 @@ metaslab_class_get_dspace(metaslab_class_t *mc)
return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}

void
metaslab_class_histogram_verify(metaslab_class_t *mc)
{
vdev_t *rvd = mc->mc_spa->spa_root_vdev;
uint64_t *mc_hist;
int i;

if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
return;

mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
KM_SLEEP);

for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;

/*
* Skip any holes, uninitialized top-levels, or
* vdevs that are not in this metaslab class.
*/
if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
mg->mg_class != mc) {
continue;
}

for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
mc_hist[i] += mg->mg_histogram[i];
}

for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);

kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

/*
* Calculate the metaslab class's fragmentation metric. The metric
* is weighted based on the space contribution of each metaslab group.
* The return value will be a number between 0 and 100 (inclusive), or
* ZFS_FRAG_INVALID if the metric has not been set. See comment above the
* zfs_frag_table for more information about the metric.
*/
uint64_t
metaslab_class_fragmentation(metaslab_class_t *mc)
{
vdev_t *rvd = mc->mc_spa->spa_root_vdev;
uint64_t fragmentation = 0;

spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);

for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;

/*
* Skip any holes, uninitialized top-levels, or
* vdevs that are not in this metaslab class.
*/
if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
mg->mg_class != mc) {
continue;
}

/*
* If a metaslab group does not contain a fragmentation
* metric then just bail out.
*/
if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
return (ZFS_FRAG_INVALID);
}

/*
* Determine how much this metaslab_group is contributing
* to the overall pool fragmentation metric.
*/
fragmentation += mg->mg_fragmentation *
metaslab_group_get_space(mg);
}
fragmentation /= metaslab_class_get_space(mc);

ASSERT3U(fragmentation, <=, 100);
spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
return (fragmentation);
}
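The class metric above is a space-weighted mean of the per-group metrics. A minimal standalone sketch of the same arithmetic; the group sizes and fragmentation values are invented for illustration:

#include <stdint.h>
#include <stdio.h>

/* Space-weighted mean, mirroring metaslab_class_fragmentation() above. */
int
main(void)
{
	/* Two hypothetical top-level vdevs: space in bytes, fragmentation %. */
	uint64_t space[2] = { 100ULL << 30, 300ULL << 30 };	/* 100G, 300G */
	uint64_t frag[2] = { 80, 20 };
	uint64_t weighted = 0, total = 0;

	for (int i = 0; i < 2; i++) {
		weighted += frag[i] * space[i];
		total += space[i];
	}
	/* (80 * 100G + 20 * 300G) / 400G = 35, not the unweighted mean of 50. */
	(void) printf("class fragmentation: %llu%%\n",
	    (unsigned long long)(weighted / total));
	return (0);
}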
/*
* Calculate the amount of expandable space that is available in
* this metaslab class. If a device is expanded then its expandable
* space will be the amount of allocatable space that is currently not
* part of this metaslab class.
*/
uint64_t
metaslab_class_expandable_space(metaslab_class_t *mc)
{
vdev_t *rvd = mc->mc_spa->spa_root_vdev;
uint64_t space = 0;

spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;

if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
mg->mg_class != mc) {
continue;
}

space += tvd->vdev_max_asize - tvd->vdev_asize;
}
spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
return (space);
}

/*
* ==========================================================================
* Metaslab groups
@@ -299,7 +438,15 @@ metaslab_group_alloc_update(metaslab_group_t *mg)
mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
(vs->vs_space + 1);

mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
/*
* A metaslab group is considered allocatable if it has plenty
* of free space or is not heavily fragmented. We only take
* fragmentation into account if the metaslab group has a valid
* fragmentation metric (i.e. a value between 0 and 100).
*/
mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
(mg->mg_fragmentation == ZFS_FRAG_INVALID ||
mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));

/*
* The mc_alloc_groups maintains a count of the number of
@@ -320,6 +467,7 @@ metaslab_group_alloc_update(metaslab_group_t *mg)
mc->mc_alloc_groups--;
else if (!was_allocatable && mg->mg_allocatable)
mc->mc_alloc_groups++;

mutex_exit(&mg->mg_lock);
}

@@ -409,6 +557,7 @@ metaslab_group_passivate(metaslab_group_t *mg)
}

taskq_wait(mg->mg_taskq);
metaslab_group_alloc_update(mg);

mgprev = mg->mg_prev;
mgnext = mg->mg_next;
@@ -425,20 +574,113 @@ metaslab_group_passivate(metaslab_group_t *mg)
mg->mg_next = NULL;
}

uint64_t
metaslab_group_get_space(metaslab_group_t *mg)
{
return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
}

void
metaslab_group_histogram_verify(metaslab_group_t *mg)
{
uint64_t *mg_hist;
vdev_t *vd = mg->mg_vd;
uint64_t ashift = vd->vdev_ashift;
int i;

if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
return;

mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
KM_SLEEP);

ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
SPACE_MAP_HISTOGRAM_SIZE + ashift);

for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];

if (msp->ms_sm == NULL)
continue;

for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
mg_hist[i + ashift] +=
msp->ms_sm->sm_phys->smp_histogram[i];
}

for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);

kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

static void
metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
{
metaslab_class_t *mc = mg->mg_class;
uint64_t ashift = mg->mg_vd->vdev_ashift;

ASSERT(MUTEX_HELD(&msp->ms_lock));
if (msp->ms_sm == NULL)
return;

mutex_enter(&mg->mg_lock);
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
mg->mg_histogram[i + ashift] +=
msp->ms_sm->sm_phys->smp_histogram[i];
mc->mc_histogram[i + ashift] +=
msp->ms_sm->sm_phys->smp_histogram[i];
}
mutex_exit(&mg->mg_lock);
}

void
metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
{
metaslab_class_t *mc = mg->mg_class;
uint64_t ashift = mg->mg_vd->vdev_ashift;

ASSERT(MUTEX_HELD(&msp->ms_lock));
if (msp->ms_sm == NULL)
return;

mutex_enter(&mg->mg_lock);
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
ASSERT3U(mg->mg_histogram[i + ashift], >=,
msp->ms_sm->sm_phys->smp_histogram[i]);
ASSERT3U(mc->mc_histogram[i + ashift], >=,
msp->ms_sm->sm_phys->smp_histogram[i]);

mg->mg_histogram[i + ashift] -=
msp->ms_sm->sm_phys->smp_histogram[i];
mc->mc_histogram[i + ashift] -=
msp->ms_sm->sm_phys->smp_histogram[i];
}
mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
mutex_enter(&mg->mg_lock);
ASSERT(msp->ms_group == NULL);
mutex_enter(&mg->mg_lock);
msp->ms_group = mg;
msp->ms_weight = 0;
avl_add(&mg->mg_metaslab_tree, msp);
mutex_exit(&mg->mg_lock);

mutex_enter(&msp->ms_lock);
metaslab_group_histogram_add(mg, msp);
mutex_exit(&msp->ms_lock);
}

static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
mutex_enter(&msp->ms_lock);
metaslab_group_histogram_remove(mg, msp);
mutex_exit(&msp->ms_lock);

mutex_enter(&mg->mg_lock);
ASSERT(msp->ms_group == mg);
avl_remove(&mg->mg_metaslab_tree, msp);
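The i + ashift indexing above translates between two histogram domains: on-disk space-map buckets are relative to sm_shift, while the group and class histograms are indexed by absolute power-of-two segment size. A standalone sketch of the translation, with a hypothetical ashift and histogram:

#include <stdint.h>
#include <stdio.h>

#define	SM_HIST_SIZE	32	/* SPACE_MAP_HISTOGRAM_SIZE */
#define	RT_HIST_SIZE	64	/* RANGE_TREE_HISTOGRAM_SIZE */

int
main(void)
{
	uint64_t sm_hist[SM_HIST_SIZE] = { 0 };
	uint64_t mg_hist[RT_HIST_SIZE] = { 0 };
	uint64_t ashift = 9;	/* hypothetical 512-byte sectors */

	sm_hist[4] = 7;		/* 7 free segments of 2^(4+9) = 8K each */

	/* Fold the relative buckets into the absolute-size histogram. */
	for (int i = 0; i < SM_HIST_SIZE; i++)
		mg_hist[i + ashift] += sm_hist[i];

	/* Bucket 13 is the absolute 8K bucket (2^13 = 8192). */
	(void) printf("mg_hist[13] = %llu\n",
	    (unsigned long long)mg_hist[13]);
	return (0);
}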
@@ -451,9 +693,9 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
/*
* Although in principle the weight can be any value, in
* practice we do not use values in the range [1, 510].
* practice we do not use values in the range [1, 511].
*/
ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
ASSERT(MUTEX_HELD(&msp->ms_lock));

mutex_enter(&mg->mg_lock);
@@ -464,10 +706,43 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
mutex_exit(&mg->mg_lock);
}

/*
* Calculate the fragmentation for a given metaslab group. We can use
* a simple average here since all metaslabs within the group must have
* the same size. The return value will be a value between 0 and 100
* (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
* group have a fragmentation metric.
*/
uint64_t
metaslab_group_fragmentation(metaslab_group_t *mg)
{
vdev_t *vd = mg->mg_vd;
uint64_t fragmentation = 0;
uint64_t valid_ms = 0;

for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];

if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
continue;

valid_ms++;
fragmentation += msp->ms_fragmentation;
}

if (valid_ms <= vd->vdev_ms_count / 2)
return (ZFS_FRAG_INVALID);

fragmentation /= valid_ms;
ASSERT3U(fragmentation, <=, 100);
return (fragmentation);
}
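The group metric above is an unweighted mean, gated so that a value is only reported once more than half of the group's metaslabs have a valid metric. A standalone sketch of that gate, with invented per-metaslab values and UINT64_MAX standing in for ZFS_FRAG_INVALID:

#include <stdint.h>
#include <stdio.h>

#define	FRAG_INVALID	UINT64_MAX	/* stand-in for ZFS_FRAG_INVALID */

/* Unweighted mean over valid metaslabs, as in metaslab_group_fragmentation(). */
static uint64_t
group_fragmentation(const uint64_t *ms_frag, int ms_count)
{
	uint64_t sum = 0, valid = 0;

	for (int m = 0; m < ms_count; m++) {
		if (ms_frag[m] == FRAG_INVALID)
			continue;
		valid++;
		sum += ms_frag[m];
	}
	/* Require a majority of valid metaslabs before reporting a value. */
	if (valid <= (uint64_t)ms_count / 2)
		return (FRAG_INVALID);
	return (sum / valid);
}

int
main(void)
{
	uint64_t frag[4] = { 10, 30, FRAG_INVALID, 20 };

	/* 3 of 4 metaslabs are valid: (10 + 30 + 20) / 3 = 20. */
	(void) printf("group fragmentation: %llu%%\n",
	    (unsigned long long)group_fragmentation(frag, 4));
	return (0);
}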
/*
* Determine if a given metaslab group should skip allocations. A metaslab
* group should avoid allocations if its used capacity has crossed the
* zfs_mg_noalloc_threshold and there is at least one metaslab group
* group should avoid allocations if its free capacity is less than the
* zfs_mg_noalloc_threshold or its fragmentation metric is greater than
* zfs_mg_fragmentation_threshold and there is at least one metaslab group
* that can still handle allocations.
*/
static boolean_t
@@ -478,12 +753,19 @@ metaslab_group_allocatable(metaslab_group_t *mg)
metaslab_class_t *mc = mg->mg_class;

/*
* A metaslab group is considered allocatable if its free capacity
* is greater than the set value of zfs_mg_noalloc_threshold, it's
* associated with a slog, or there are no other metaslab groups
* with free capacity greater than zfs_mg_noalloc_threshold.
* We use two key metrics to determine if a metaslab group is
* considered allocatable -- free space and fragmentation. If
* the free space is greater than the free space threshold and
* the fragmentation is less than the fragmentation threshold then
* consider the group allocatable. There are two cases when we will
* not consider these key metrics. The first is if the group is
* associated with a slog device and the second is if all groups
* in this metaslab class have already been considered ineligible
* for allocations.
*/
return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
(mg->mg_fragmentation == ZFS_FRAG_INVALID ||
mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) ||
mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
}
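The rewritten predicate combines the two thresholds. A minimal sketch of just the free-capacity and fragmentation test; the slog and last-resort escape hatches of the real function are omitted here for brevity:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define	FRAG_INVALID	UINT64_MAX	/* stand-in for ZFS_FRAG_INVALID */

/* Core of metaslab_group_allocatable(): capacity AND fragmentation checks. */
static bool
group_allocatable(uint64_t free_capacity, uint64_t fragmentation,
    uint64_t noalloc_threshold, uint64_t frag_threshold)
{
	return (free_capacity > noalloc_threshold &&
	    (fragmentation == FRAG_INVALID ||
	    fragmentation <= frag_threshold));
}

int
main(void)
{
	/* 40% free but 90% fragmented: skipped at the default 85% cutoff. */
	(void) printf("%d\n", group_allocatable(40, 90, 0, 85));
	/* Same free space with no metric yet: still eligible. */
	(void) printf("%d\n", group_allocatable(40, FRAG_INVALID, 0, 85));
	return (0);
}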
@@ -707,16 +989,8 @@ metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
return (metaslab_block_picker(t, cursor, size, align));
}

/* ARGSUSED */
static boolean_t
metaslab_ff_fragmented(metaslab_t *msp)
{
return (B_TRUE);
}

static metaslab_ops_t metaslab_ff_ops = {
metaslab_ff_alloc,
metaslab_ff_fragmented
metaslab_ff_alloc
};

/*
@@ -763,23 +1037,8 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
return (metaslab_block_picker(t, cursor, size, 1ULL));
}

static boolean_t
metaslab_df_fragmented(metaslab_t *msp)
{
range_tree_t *rt = msp->ms_tree;
uint64_t max_size = metaslab_block_maxsize(msp);
int free_pct = range_tree_space(rt) * 100 / msp->ms_size;

if (max_size >= metaslab_df_alloc_threshold &&
free_pct >= metaslab_df_free_pct)
return (B_FALSE);

return (B_TRUE);
}

static metaslab_ops_t metaslab_df_ops = {
metaslab_df_alloc,
metaslab_df_fragmented
metaslab_df_alloc
};

/*
@@ -822,15 +1081,8 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
return (offset);
}

static boolean_t
metaslab_cf_fragmented(metaslab_t *msp)
{
return (metaslab_block_maxsize(msp) < metaslab_min_alloc_size);
}

static metaslab_ops_t metaslab_cf_ops = {
metaslab_cf_alloc,
metaslab_cf_fragmented
metaslab_cf_alloc
};

/*
@@ -887,16 +1139,8 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
return (-1ULL);
}

static boolean_t
metaslab_ndf_fragmented(metaslab_t *msp)
{
return (metaslab_block_maxsize(msp) <=
(metaslab_min_alloc_size << metaslab_ndf_clump_shift));
}

static metaslab_ops_t metaslab_ndf_ops = {
metaslab_ndf_alloc,
metaslab_ndf_fragmented
metaslab_ndf_alloc
};

metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
@@ -998,6 +1242,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg)
msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock);
metaslab_group_add(mg, msp);

msp->ms_fragmentation = metaslab_fragmentation(msp);
msp->ms_ops = mg->mg_class->mc_ops;

/*
@@ -1063,69 +1308,113 @@ metaslab_fini(metaslab_t *msp)
kmem_free(msp, sizeof (metaslab_t));
}

#define FRAGMENTATION_TABLE_SIZE 17

/*
* Apply a weighting factor based on the histogram information for this
* metaslab. The current weighting factor is somewhat arbitrary and requires
* additional investigation. The implementation provides a measure of
* "weighted" free space and gives a higher weighting for larger contiguous
* regions. The weighting factor is determined by counting the number of
* sm_shift sectors that exist in each region represented by the histogram.
* That value is then multiplied by the power of 2 exponent and the sm_shift
* value.
* This table defines a segment size based fragmentation metric that will
* allow each metaslab to derive its own fragmentation value. This is done
* by calculating the space in each bucket of the spacemap histogram and
* multiplying that by the fragmentation metric in this table. Doing
* this for all buckets and dividing it by the total amount of free
* space in this metaslab (i.e. the total free space in all buckets) gives
* us the fragmentation metric. This means that a high fragmentation metric
* equates to most of the free space being comprised of small segments.
* Conversely, if the metric is low, then most of the free space is in
* large segments. A 10% change in fragmentation equates to approximately
* double the number of segments.
*
* For example, assume the 2^21 histogram bucket has 4 2MB regions and the
* metaslab has an sm_shift value of 9 (512B):
*
* 1) calculate the number of sm_shift sectors in the region:
* 2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384
* 2) multiply by the power of 2 exponent and the sm_shift value:
* 16384 * 21 * 9 = 3096576
* This value will be added to the weighting of the metaslab.
* This table defines 0% fragmented space using 16MB segments. Testing has
* shown that segments that are greater than or equal to 16MB do not suffer
* from drastic performance problems. Using this value, we derive the rest
* of the table. Since the fragmentation value is never stored on disk, it
* is possible to change these calculations in the future.
*/
int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
100, /* 512B */
100, /* 1K */
98, /* 2K */
95, /* 4K */
90, /* 8K */
80, /* 16K */
70, /* 32K */
60, /* 64K */
50, /* 128K */
40, /* 256K */
30, /* 512K */
20, /* 1M */
15, /* 2M */
10, /* 4M */
5, /* 8M */
0 /* 16M */
};

/*
* Calculate the metaslab's fragmentation metric. A return value
* of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
* not support this metric. Otherwise, the return value should be in the
* range [0, 100].
*/
static uint64_t
metaslab_weight_factor(metaslab_t *msp)
metaslab_fragmentation(metaslab_t *msp)
{
uint64_t factor = 0;
uint64_t sectors;
int i;
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
uint64_t fragmentation = 0;
uint64_t total = 0;
boolean_t feature_enabled = spa_feature_is_enabled(spa,
SPA_FEATURE_SPACEMAP_HISTOGRAM);

if (!feature_enabled)
return (ZFS_FRAG_INVALID);

/*
* A null space map means that the entire metaslab is free,
* calculate a weight factor that spans the entire size of the
* metaslab.
* A null space map means that the entire metaslab is free
* and thus is not fragmented.
*/
if (msp->ms_sm == NULL) {
vdev_t *vd = msp->ms_group->mg_vd;

i = highbit64(msp->ms_size) - 1;
sectors = msp->ms_size >> vd->vdev_ashift;
return (sectors * i * vd->vdev_ashift);
}

if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
if (msp->ms_sm == NULL)
return (0);

for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE(msp->ms_sm); i++) {
/*
* If this metaslab's space_map has not been upgraded, flag it
* so that we upgrade next time we encounter it.
*/
if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
uint64_t txg = spa_syncing_txg(spa);
vdev_t *vd = msp->ms_group->mg_vd;

msp->ms_condense_wanted = B_TRUE;
vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
spa_dbgmsg(spa, "txg %llu, requesting force condense: "
"msp %p, vd %p", txg, msp, vd);
return (ZFS_FRAG_INVALID);
}

for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
uint64_t space = 0;
uint8_t shift = msp->ms_sm->sm_shift;
int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
FRAGMENTATION_TABLE_SIZE - 1);

if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
continue;

/*
* Determine the number of sm_shift sectors in the region
* indicated by the histogram. For example, given an
* sm_shift value of 9 (512 bytes) and i = 4 then we know
* that we're looking at an 8K region in the histogram
* (i.e. 9 + 4 = 13, 2^13 = 8192). To figure out the
* number of sm_shift sectors (512 bytes in this example),
* we would take 8192 / 512 = 16. Since the histogram
* is offset by sm_shift we can simply use the value
* of i to calculate this (i.e. 2^i = 16 where i = 4).
*/
sectors = msp->ms_sm->sm_phys->smp_histogram[i] << i;
factor += (i + msp->ms_sm->sm_shift) * sectors;
space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
total += space;

ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
fragmentation += space * zfs_frag_table[idx];
}
return (factor * msp->ms_sm->sm_shift);

if (total > 0)
fragmentation /= total;
ASSERT3U(fragmentation, <=, 100);
return (fragmentation);
}
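The new per-metaslab computation is a table-weighted average over the space-map histogram. A self-contained sketch of the same loop; the table values are copied from zfs_frag_table above, while the histogram contents and sm_shift are hypothetical:

#include <stdint.h>
#include <stdio.h>

#define	MINBLOCKSHIFT	9	/* SPA_MINBLOCKSHIFT */
#define	TABLE_SIZE	16
#define	HISTOGRAM_SIZE	32

static const int frag_table[TABLE_SIZE] = {
	100, 100, 98, 95, 90, 80, 70, 60,	/* 512B .. 64K */
	50, 40, 30, 20, 15, 10, 5, 0		/* 128K .. 16M */
};

int
main(void)
{
	uint64_t histogram[HISTOGRAM_SIZE] = { 0 };
	uint8_t shift = 9;	/* sm_shift: bucket 0 holds 512B segments */
	uint64_t fragmentation = 0, total = 0;

	histogram[4] = 100;	/* 100 free segments of 8K (2^(9+4)) */
	histogram[12] = 2;	/* 2 free segments of 2M (2^(9+12)) */

	for (int i = 0; i < HISTOGRAM_SIZE; i++) {
		int idx = shift - MINBLOCKSHIFT + i;

		if (histogram[i] == 0)
			continue;
		if (idx > TABLE_SIZE - 1)
			idx = TABLE_SIZE - 1;
		/* Space in this bucket, weighted by its table entry. */
		uint64_t space = histogram[i] << (i + shift);
		total += space;
		fragmentation += space * frag_table[idx];
	}
	if (total > 0)
		fragmentation /= total;
	/* (800K * 90 + 4M * 15) / 4.8M rounds down to 27%: mostly large segments. */
	(void) printf("fragmentation: %llu%%\n",
	    (unsigned long long)fragmentation);
	return (0);
}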
/*
* Compute a weight -- a selection preference value -- for the given metaslab.
* This is based on the amount of free space, the level of fragmentation,
* the LBA range, and whether the metaslab is loaded.
*/
static uint64_t
metaslab_weight(metaslab_t *msp)
{
@@ -1149,6 +1438,29 @@ metaslab_weight(metaslab_t *msp)
* The baseline weight is the metaslab's free space.
*/
space = msp->ms_size - space_map_allocated(msp->ms_sm);

msp->ms_fragmentation = metaslab_fragmentation(msp);
if (metaslab_fragmentation_factor_enabled &&
msp->ms_fragmentation != ZFS_FRAG_INVALID) {
/*
* Use the fragmentation information to inversely scale
* down the baseline weight. We need to ensure that we
* don't exclude this metaslab completely when it's 100%
* fragmented. To avoid this we reduce the fragmented value
* by 1.
*/
space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;

/*
* If space < SPA_MINBLOCKSIZE, then we will not allocate from
* this metaslab again. The fragmentation metric may have
* decreased the space to something smaller than
* SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
* so that we can consume any remaining space.
*/
if (space > 0 && space < SPA_MINBLOCKSIZE)
space = SPA_MINBLOCKSIZE;
}
weight = space;

/*
@@ -1160,19 +1472,19 @@ metaslab_weight(metaslab_t *msp)
* In effect, this means that we'll select the metaslab with the most
* free bandwidth rather than simply the one with the most free space.
*/
weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
ASSERT(weight >= space && weight <= 2 * space);
if (metaslab_lba_weighting_enabled) {
weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
ASSERT(weight >= space && weight <= 2 * space);
}

msp->ms_factor = metaslab_weight_factor(msp);
if (metaslab_weight_factor_enable)
weight += msp->ms_factor;

if (msp->ms_loaded && !msp->ms_ops->msop_fragmented(msp)) {
/*
* If this metaslab is one we're actively using, adjust its
* weight to make it preferable to any inactive metaslab so
* we'll polish it off.
*/
/*
* If this metaslab is one we're actively using, adjust its
* weight to make it preferable to any inactive metaslab so
* we'll polish it off. If the fragmentation on this metaslab
* has exceeded our threshold, then don't mark it active.
*/
if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
}
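The scaling expression above, space * (100 - (ms_fragmentation - 1)) / 100, runs from 101% of the baseline weight at 0% fragmentation down to 1% at 100%, which is why a fully fragmented metaslab keeps a nonzero weight. A quick standalone check with a hypothetical 1G of free space:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t space = 1ULL << 30;	/* hypothetical 1G baseline weight */

	/* Signed frag avoids the unsigned wrap; the kernel math is modular. */
	for (int frag = 0; frag <= 100; frag += 25) {
		uint64_t scaled = (space * (100 - (frag - 1))) / 100;
		(void) printf("frag %3d%% -> weight %llu\n", frag,
		    (unsigned long long)scaled);
	}
	return (0);
}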
@@ -1257,9 +1569,16 @@ metaslab_group_preload(metaslab_group_t *mg)
while (msp != NULL) {
metaslab_t *msp_next = AVL_NEXT(t, msp);

/* If we have reached our preload limit then we're done */
if (++m > metaslab_preload_limit)
break;
/*
* We preload only the maximum number of metaslabs specified
* by metaslab_preload_limit. If a metaslab is being forced
* to condense then we preload it too. This will ensure
* that force condensing happens in the next txg.
*/
if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
msp = msp_next;
continue;
}

/*
* We must drop the metaslab group lock here to preserve
@@ -1327,11 +1646,12 @@ metaslab_should_condense(metaslab_t *msp)

/*
* Use the ms_size_tree range tree, which is ordered by size, to
* obtain the largest segment in the free tree. If the tree is empty
* then we should condense the map.
* obtain the largest segment in the free tree. We always condense
* metaslabs that are empty and metaslabs for which a condense
* request has been made.
*/
rs = avl_last(&msp->ms_size_tree);
if (rs == NULL)
if (rs == NULL || msp->ms_condense_wanted)
return (B_TRUE);

/*
@@ -1372,9 +1692,14 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
ASSERT3U(spa_sync_pass(spa), ==, 1);
ASSERT(msp->ms_loaded);

spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
"smp size %llu, segments %lu", txg, msp->ms_id, msp,
space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root));
"smp size %llu, segments %lu, forcing condense=%s", txg,
msp->ms_id, msp, space_map_length(msp->ms_sm),
avl_numnodes(&msp->ms_tree->rt_root),
msp->ms_condense_wanted ? "TRUE" : "FALSE");

msp->ms_condense_wanted = B_FALSE;

/*
* Create a range tree that is 100% allocated. We remove segments
@@ -1467,8 +1792,14 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
ASSERT3P(*freetree, !=, NULL);
ASSERT3P(*freed_tree, !=, NULL);

/*
* Normally, we don't want to process a metaslab if there
* are no allocations or frees to perform. However, if the metaslab
* is being forced to condense we need to let it through.
*/
if (range_tree_space(alloctree) == 0 &&
range_tree_space(*freetree) == 0)
range_tree_space(*freetree) == 0 &&
!msp->ms_condense_wanted)
return;

/*
@@ -1505,8 +1836,9 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
}

range_tree_vacate(alloctree, NULL, NULL);

metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);
metaslab_group_histogram_remove(mg, msp);
if (msp->ms_loaded) {
/*
* When the space map is loaded, we have an accurate
@@ -1526,6 +1858,9 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
*/
space_map_histogram_add(msp->ms_sm, *freetree, tx);
}
metaslab_group_histogram_add(mg, msp);
metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);

/*
* For sync pass 1, we avoid traversing this txg's free range tree
@@ -1538,6 +1873,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
} else {
range_tree_vacate(*freetree, range_tree_add, *freed_tree);
}
range_tree_vacate(alloctree, NULL, NULL);

ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
@@ -1648,13 +1984,13 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)

metaslab_group_sort(mg, msp, metaslab_weight(msp));
mutex_exit(&msp->ms_lock);

}

void
metaslab_sync_reassess(metaslab_group_t *mg)
{
metaslab_group_alloc_update(mg);
mg->mg_fragmentation = metaslab_group_fragmentation(mg);

/*
* Preload the next potential metaslabs
@@ -1916,9 +2252,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
*/
if ((vd->vdev_stat.vs_write_errors > 0 ||
vd->vdev_state < VDEV_STATE_HEALTHY) &&
d == 0 && dshift == 3 &&
!(zfs_write_to_degraded && vd->vdev_state ==
VDEV_STATE_DEGRADED)) {
d == 0 && dshift == 3 && vd->vdev_children == 0) {
all_zero = B_FALSE;
goto next;
}
@@ -1943,7 +2277,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
* over- or under-used relative to the pool,
* and set an allocation bias to even it out.
*/
if (mc->mc_aliquot == 0) {
if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
vdev_stat_t *vs = &vd->vdev_stat;
int64_t vu, cu;

@@ -1965,6 +2299,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
*/
mg->mg_bias = ((cu - vu) *
(int64_t)mg->mg_aliquot) / 100;
} else if (!metaslab_bias_enabled) {
mg->mg_bias = 0;
}

if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=

@@ -81,6 +81,7 @@ range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
uint64_t size = rs->rs_end - rs->rs_start;
int idx = highbit64(size) - 1;

ASSERT(size != 0);
ASSERT3U(idx, <,
sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

@@ -95,6 +96,7 @@ range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
uint64_t size = rs->rs_end - rs->rs_start;
int idx = highbit64(size) - 1;

ASSERT(size != 0);
ASSERT3U(idx, <,
sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

@@ -194,12 +194,10 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
vdev_t *rvd = spa->spa_root_vdev;
dsl_pool_t *pool = spa->spa_dsl_pool;
uint64_t size;
uint64_t alloc;
uint64_t space;
uint64_t cap, version;
uint64_t size, alloc, cap, version;
zprop_source_t src = ZPROP_SRC_NONE;
spa_config_dirent_t *dp;
metaslab_class_t *mc = spa_normal_class(spa);

ASSERT(MUTEX_HELD(&spa->spa_props_lock));

@@ -212,14 +210,10 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
size - alloc, src);

space = 0;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
space += tvd->vdev_max_asize - tvd->vdev_asize;
}
spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space,
src);

spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
metaslab_class_fragmentation(mc), src);
spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
metaslab_class_expandable_space(mc), src);
spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
(spa_mode(spa) == FREAD), src);

@@ -202,10 +202,10 @@ space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx)
* reached the maximum bucket size. Accumulate all ranges
* larger than the max bucket size into the last bucket.
*/
if (idx < SPACE_MAP_HISTOGRAM_SIZE(sm) - 1) {
if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
ASSERT3U(idx + sm->sm_shift, ==, i);
idx++;
ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE(sm));
ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
}
}
}

@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
*/

#ifndef _SYS_METASLAB_H
@@ -38,23 +38,22 @@ extern "C" {

typedef struct metaslab_ops {
uint64_t (*msop_alloc)(metaslab_t *msp, uint64_t size);
boolean_t (*msop_fragmented)(metaslab_t *msp);
} metaslab_ops_t;

extern metaslab_ops_t *zfs_metaslab_ops;

metaslab_t *metaslab_init(metaslab_group_t *mg, uint64_t id,
uint64_t object, uint64_t txg);
void metaslab_fini(metaslab_t *msp);
metaslab_t *metaslab_init(metaslab_group_t *, uint64_t,
uint64_t, uint64_t);
void metaslab_fini(metaslab_t *);

void metaslab_load_wait(metaslab_t *msp);
int metaslab_load(metaslab_t *msp);
void metaslab_unload(metaslab_t *msp);
void metaslab_load_wait(metaslab_t *);
int metaslab_load(metaslab_t *);
void metaslab_unload(metaslab_t *);

void metaslab_sync(metaslab_t *msp, uint64_t txg);
void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
void metaslab_sync_reassess(metaslab_group_t *mg);
uint64_t metaslab_block_maxsize(metaslab_t *msp);
void metaslab_sync(metaslab_t *, uint64_t);
void metaslab_sync_done(metaslab_t *, uint64_t);
void metaslab_sync_reassess(metaslab_group_t *);
uint64_t metaslab_block_maxsize(metaslab_t *);

#define METASLAB_HINTBP_FAVOR 0x0
#define METASLAB_HINTBP_AVOID 0x1
@@ -62,28 +61,34 @@ uint64_t metaslab_block_maxsize(metaslab_t *msp);
#define METASLAB_GANG_CHILD 0x4
#define METASLAB_GANG_AVOID 0x8

int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags);
void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now);
int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
void metaslab_check_free(spa_t *spa, const blkptr_t *bp);
int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
blkptr_t *, int, uint64_t, blkptr_t *, int);
void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
void metaslab_check_free(spa_t *, const blkptr_t *);

metaslab_class_t *metaslab_class_create(spa_t *spa, metaslab_ops_t *ops);
void metaslab_class_destroy(metaslab_class_t *mc);
int metaslab_class_validate(metaslab_class_t *mc);
metaslab_class_t *metaslab_class_create(spa_t *, metaslab_ops_t *);
void metaslab_class_destroy(metaslab_class_t *);
int metaslab_class_validate(metaslab_class_t *);
void metaslab_class_histogram_verify(metaslab_class_t *);
uint64_t metaslab_class_fragmentation(metaslab_class_t *);
uint64_t metaslab_class_expandable_space(metaslab_class_t *);

void metaslab_class_space_update(metaslab_class_t *mc,
int64_t alloc_delta, int64_t defer_delta,
int64_t space_delta, int64_t dspace_delta);
uint64_t metaslab_class_get_alloc(metaslab_class_t *mc);
uint64_t metaslab_class_get_space(metaslab_class_t *mc);
uint64_t metaslab_class_get_dspace(metaslab_class_t *mc);
uint64_t metaslab_class_get_deferred(metaslab_class_t *mc);
void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
int64_t, int64_t);
uint64_t metaslab_class_get_alloc(metaslab_class_t *);
uint64_t metaslab_class_get_space(metaslab_class_t *);
uint64_t metaslab_class_get_dspace(metaslab_class_t *);
uint64_t metaslab_class_get_deferred(metaslab_class_t *);

metaslab_group_t *metaslab_group_create(metaslab_class_t *mc, vdev_t *vd);
void metaslab_group_destroy(metaslab_group_t *mg);
void metaslab_group_activate(metaslab_group_t *mg);
void metaslab_group_passivate(metaslab_group_t *mg);
metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *);
void metaslab_group_destroy(metaslab_group_t *);
void metaslab_group_activate(metaslab_group_t *);
void metaslab_group_passivate(metaslab_group_t *);
uint64_t metaslab_group_get_space(metaslab_group_t *);
void metaslab_group_histogram_verify(metaslab_group_t *);
uint64_t metaslab_group_fragmentation(metaslab_group_t *);
void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);

#ifdef __cplusplus
}

@ -41,6 +41,23 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* A metaslab class encompasses a category of allocatable top-level vdevs.
|
||||
* Each top-level vdev is associated with a metaslab group which defines
|
||||
* the allocatable region for that vdev. Examples of these categories include
|
||||
* "normal" for data block allocations (i.e. main pool allocations) or "log"
|
||||
* for allocations designated for intent log devices (i.e. slog devices).
|
||||
* When a block allocation is requested from the SPA it is associated with a
|
||||
* metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging
|
||||
* to the class can be used to satisfy that request. Allocations are done
|
||||
* by traversing the metaslab groups that are linked off of the mc_rotor field.
|
||||
* This rotor points to the next metaslab group where allocations will be
|
||||
* attempted. Allocating a block is a 3 step process -- select the metaslab
|
||||
* group, select the metaslab, and then allocate the block. The metaslab
|
||||
* class defines the low-level block allocator that will be used as the
|
||||
* final step in allocation. These allocators are pluggable allowing each class
|
||||
* to use a block allocator that best suits that class.
|
||||
*/
|
||||
struct metaslab_class {
|
||||
spa_t *mc_spa;
|
||||
metaslab_group_t *mc_rotor;
|
||||
@ -51,8 +68,18 @@ struct metaslab_class {
|
||||
uint64_t mc_deferred; /* total deferred frees */
|
||||
uint64_t mc_space; /* total space (alloc + free) */
|
||||
uint64_t mc_dspace; /* total deflated space */
|
||||
uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
|
||||
};
|
||||
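The 3 step allocation described above can be pictured as a short walk of the
rotor's circular list. The sketch below is illustrative only and assumes two
hypothetical helpers, select_metaslab() for steps 1 and 2 and class_alloc()
standing in for the class's pluggable allocator; it is not code from this
commit:

/*
 * Illustrative sketch; select_metaslab() and class_alloc() are
 * hypothetical stand-ins, not functions added by this commit.
 */
static int
alloc_from_class(metaslab_class_t *mc, uint64_t size, uint64_t *offset)
{
        metaslab_group_t *mg = mc->mc_rotor;

        do {
                /* steps 1 and 2: pick a group, then a metaslab within it */
                metaslab_t *msp = select_metaslab(mg, size);

                /* step 3: the class's block allocator places the block */
                if (msp != NULL && class_alloc(mc, msp, size, offset) == 0) {
                        mc->mc_rotor = mg->mg_next;     /* advance rotor */
                        return (0);
                }
                mg = mg->mg_next;       /* group ineligible; try the next */
        } while (mg != mc->mc_rotor);

        return (ENOSPC);
}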

/*
* Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
* of a top-level vdev. They are linked together to form a circular linked
* list and can belong to only one metaslab class. Metaslab groups may become
* ineligible for allocations for a number of reasons such as limited free
* space, fragmentation, or going offline. When this happens the allocator will
* simply find the next metaslab group in the linked list and attempt
* to allocate from that group instead.
*/
struct metaslab_group {
kmutex_t mg_lock;
avl_tree_t mg_metaslab_tree;
@ -66,12 +93,14 @@ struct metaslab_group {
taskq_t *mg_taskq;
metaslab_group_t *mg_prev;
metaslab_group_t *mg_next;
uint64_t mg_fragmentation;
uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
};
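A minimal sketch of the circular mg_prev/mg_next linkage the comment
describes, assuming the usual doubly-linked list discipline; the helper
metaslab_group_link() is invented here for illustration and is not part of
this commit:

static void
metaslab_group_link(metaslab_group_t **rotor, metaslab_group_t *mg)
{
        if (*rotor == NULL) {
                /* first group in the class forms a one-element circle */
                mg->mg_prev = mg->mg_next = mg;
                *rotor = mg;
        } else {
                /* splice the new group in after the current rotor */
                mg->mg_prev = *rotor;
                mg->mg_next = (*rotor)->mg_next;
                (*rotor)->mg_next->mg_prev = mg;
                (*rotor)->mg_next = mg;
        }
}

Because the list is circular, an allocator that keeps stepping to mg_next is
guaranteed to revisit its starting group, which is how a full traversal is
detected.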

/*
* This value defines the number of elements in the ms_lbas array. The value
* of 64 was chosen as it covers to cover all power of 2 buckets up to
* UINT64_MAX. This is the equivalent of highbit(UINT64_MAX).
* of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX.
* This is the equivalent of highbit(UINT64_MAX).
*/
#define MAX_LBAS 64
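As a concrete check of that arithmetic: highbit() returns the 1-based index
of the highest set bit, so every nonzero uint64_t size maps onto an index
from 0 through 63. The helper below is a hypothetical illustration, not part
of this commit:

static int
lbas_bucket(uint64_t size)
{
        int hb = 0;

        while (size != 0) {     /* open-coded highbit(size) */
                hb++;
                size >>= 1;
        }
        return (hb - 1);        /* 0..63 for any nonzero size, so 64 buckets */
}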

@ -134,6 +163,7 @@ struct metaslab {
uint64_t ms_id;
uint64_t ms_start;
uint64_t ms_size;
uint64_t ms_fragmentation;

range_tree_t *ms_alloctree[TXG_SIZE];
range_tree_t *ms_freetree[TXG_SIZE];
@ -141,12 +171,12 @@ struct metaslab {
range_tree_t *ms_tree;

boolean_t ms_condensing; /* condensing? */
boolean_t ms_condense_wanted;
boolean_t ms_loaded;
boolean_t ms_loading;

int64_t ms_deferspace; /* sum of ms_defermap[] space */
uint64_t ms_weight; /* weight vs. others in group */
uint64_t ms_factor;
uint64_t ms_access_txg;

/*

@ -24,7 +24,7 @@
*/

/*
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/

#ifndef _SYS_SPACE_MAP_H
@ -44,9 +44,7 @@ extern "C" {
* maintain backward compatibility.
*/
#define SPACE_MAP_SIZE_V0 (3 * sizeof (uint64_t))
#define SPACE_MAP_HISTOGRAM_SIZE(sm) \
(sizeof ((sm)->sm_phys->smp_histogram) / \
sizeof ((sm)->sm_phys->smp_histogram[0]))
#define SPACE_MAP_HISTOGRAM_SIZE 32

/*
* The space_map_phys is the on-disk representation of the space map.
@ -68,7 +66,7 @@ typedef struct space_map_phys {
* whose size is:
* 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1)
*/
uint64_t smp_histogram[32]; /* histogram of free space */
uint64_t smp_histogram[SPACE_MAP_HISTOGRAM_SIZE];
} space_map_phys_t;
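To make the bucket formula concrete: with sm_shift == 9, a 3072-byte free
region satisfies 2^11 <= 3072 < 2^12, so it is counted in bucket i == 2. The
helper below is a hypothetical illustration of that mapping, not code from
this commit:

static int
smp_histogram_bucket(uint64_t size, int sm_shift)
{
        int idx = -1 - sm_shift;

        while (size != 0) {     /* yields highbit(size) - 1 - sm_shift */
                idx++;
                size >>= 1;
        }
        return (idx);           /* size 3072, sm_shift 9 -> bucket 2 */
}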

/*

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/

#ifndef _SYS_ZFS_DEBUG_H
@ -50,13 +50,14 @@ extern int zfs_flags;
extern boolean_t zfs_recover;
extern boolean_t zfs_free_leak_on_eio;

#define ZFS_DEBUG_DPRINTF (1<<0)
#define ZFS_DEBUG_DBUF_VERIFY (1<<1)
#define ZFS_DEBUG_DNODE_VERIFY (1<<2)
#define ZFS_DEBUG_SNAPNAMES (1<<3)
#define ZFS_DEBUG_MODIFY (1<<4)
#define ZFS_DEBUG_SPA (1<<5)
#define ZFS_DEBUG_ZIO_FREE (1<<6)
#define ZFS_DEBUG_HISTOGRAM_VERIFY (1<<7)

#ifdef ZFS_DEBUG
extern void __dprintf(const char *file, const char *func,

@ -2116,6 +2116,11 @@ vdev_remove(vdev_t *vd, uint64_t txg)
tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

if (vd->vdev_ms != NULL) {
metaslab_group_t *mg = vd->vdev_mg;

metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);

for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];

@ -2123,12 +2128,27 @@ vdev_remove(vdev_t *vd, uint64_t txg)
continue;

mutex_enter(&msp->ms_lock);
/*
* If the metaslab was not loaded when the vdev
* was removed then the histogram accounting may
* not be accurate. Update the histogram information
* here so that we ensure that the metaslab group
* and metaslab class are up-to-date.
*/
metaslab_group_histogram_remove(mg, msp);

VERIFY0(space_map_allocated(msp->ms_sm));
space_map_free(msp->ms_sm, tx);
space_map_close(msp->ms_sm);
msp->ms_sm = NULL;
mutex_exit(&msp->ms_lock);
}

metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);
for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
ASSERT0(mg->mg_histogram[i]);

}

if (vd->vdev_ms_array) {
@ -2580,7 +2600,10 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
void
vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
{
vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;

ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

mutex_enter(&vd->vdev_stat_lock);
bcopy(&vd->vdev_stat, vs, sizeof (*vs));
@ -2590,7 +2613,8 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
if (vd->vdev_ops->vdev_op_leaf)
vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize;
mutex_exit(&vd->vdev_stat_lock);
if (vd->vdev_aux == NULL && vd == vd->vdev_top)
vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;

/*
* If we're getting stats on the root vdev, aggregate the I/O counts
@ -2601,15 +2625,14 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
vdev_t *cvd = rvd->vdev_child[c];
vdev_stat_t *cvs = &cvd->vdev_stat;

mutex_enter(&vd->vdev_stat_lock);
for (int t = 0; t < ZIO_TYPES; t++) {
vs->vs_ops[t] += cvs->vs_ops[t];
vs->vs_bytes[t] += cvs->vs_bytes[t];
}
cvs->vs_scan_removing = cvd->vdev_removing;
mutex_exit(&vd->vdev_stat_lock);
}
}
mutex_exit(&vd->vdev_stat_lock);
}

void

@ -190,6 +190,7 @@ typedef enum {
ZPOOL_PROP_COMMENT,
ZPOOL_PROP_EXPANDSZ,
ZPOOL_PROP_FREEING,
ZPOOL_PROP_FRAGMENTATION,
ZPOOL_PROP_LEAKED,
ZPOOL_NUM_PROPS
} zpool_prop_t;
@ -588,6 +589,13 @@ typedef struct zpool_rewind_policy {
*/
#define SPA_MINDEVSIZE (64ULL << 20)

/*
* Set if the fragmentation has not yet been calculated. This can happen
* because the space maps have not been upgraded or the histogram feature
* is not enabled.
*/
#define ZFS_FRAG_INVALID UINT64_MAX
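Since ZFS_FRAG_INVALID is UINT64_MAX, consumers have to treat that value as
"unknown" rather than as a percentage; zpool list prints a dash in that case.
A hypothetical formatter showing the check (this helper is not part of the
commit):

static const char *
frag_to_str(uint64_t frag, char *buf, size_t buflen)
{
        if (frag == ZFS_FRAG_INVALID)
                return ("-");   /* fragmentation not yet calculated */
        (void) snprintf(buf, buflen, "%llu%%", (u_longlong_t)frag);
        return (buf);
}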

/*
* The location of the pool configuration repository, shared between kernel and
* userland.
@ -725,6 +733,7 @@ typedef struct vdev_stat {
uint64_t vs_self_healed; /* self-healed bytes */
uint64_t vs_scan_removing; /* removing? */
uint64_t vs_scan_processed; /* scan processed bytes */
uint64_t vs_fragmentation; /* device fragmentation */
} vdev_stat_t;

/*