
MFV r354383: 10592 misc. metaslab and vdev related ZoL bug fixes

illumos/illumos-gate@555d674d5d
555d674d5d

https://www.illumos.org/issues/10592
  This is a collection of recent fixes from ZoL:
  8eef997679 Error path in metaslab_load_impl() forgets to drop ms_sync_lock
  928e8ad47d Introduce auxiliary metaslab histograms
  425d3237ee Get rid of space_map_update() for ms_synced_length
  6c926f426a Simplify log vdev removal code
  21e7cf5da8 zdb -L should skip leak detection altogether
  df72b8bebe Rename range_tree_verify to range_tree_verify_not_present
  75058f3303 Remove unused vdev_t fields

Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Author: Serapheim Dimitropoulos <serapheim@delphix.com>
MFC after:	4 weeks
Committed by Andriy Gapon on 2019-11-21 13:35:43 +00:00
commit 8491540808
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=354948
16 changed files with 772 additions and 447 deletions
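The common thread in the hunks below is the removal of the space map's in-core shadow state: space_map_update(), sm_alloc, and sm_length are gone, smp_objsize becomes smp_length, and readers either consult the phys fields through space_map_allocated()/space_map_length() or pass an explicit byte offset to space_map_iterate()/space_map_load_length(). A minimal sketch of the resulting reader-side pattern; count_segments_cb() and walk_synced_space_map() are hypothetical names, not part of this commit:

/* Sketch only: walk the synced prefix of a space map with the new API. */
static int
count_segments_cb(space_map_entry_t *sme, void *arg)
{
	uint64_t *nsegs = arg;

	/* sme->sme_offset and sme->sme_run describe one segment */
	(*nsegs)++;
	return (0);
}

static void
walk_synced_space_map(space_map_t *sm)
{
	uint64_t nsegs = 0;

	/* smp_length is read directly; no space_map_update() step anymore */
	VERIFY0(space_map_iterate(sm, space_map_length(sm),
	    count_segments_cb, &nsegs));
	(void) printf("%llu segments, alloc %lld\n",
	    (u_longlong_t)nsegs, (longlong_t)space_map_allocated(sm));
}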

View File

@ -10,7 +10,7 @@
.\"
.\"
.\" Copyright 2012, Richard Lowe.
.\" Copyright (c) 2012, 2017 by Delphix. All rights reserved.
.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
.\" Copyright 2017 Nexenta Systems, Inc.
.\"
.Dd October 06, 2017
@ -187,7 +187,7 @@ If the
.Fl u
option is also specified, also display the uberblocks on this device.
.It Fl L
Disable leak tracing and the loading of space maps.
Disable leak detection and the loading of space maps.
By default,
.Nm
verifies that all non-free blocks are referenced, which can be very expensive.

View File

@ -785,18 +785,21 @@ dump_spacemap(objset_t *os, space_map_t *sm)
return;
(void) printf("space map object %llu:\n",
(longlong_t)sm->sm_phys->smp_object);
(void) printf(" smp_objsize = 0x%llx\n",
(longlong_t)sm->sm_phys->smp_objsize);
(longlong_t)sm->sm_object);
(void) printf(" smp_length = 0x%llx\n",
(longlong_t)sm->sm_phys->smp_length);
(void) printf(" smp_alloc = 0x%llx\n",
(longlong_t)sm->sm_phys->smp_alloc);
if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
return;
/*
* Print out the freelist entries in both encoded and decoded form.
*/
uint8_t mapshift = sm->sm_shift;
int64_t alloc = 0;
uint64_t word;
uint64_t word, entry_id = 0;
for (uint64_t offset = 0; offset < space_map_length(sm);
offset += sizeof (word)) {
@ -804,11 +807,12 @@ dump_spacemap(objset_t *os, space_map_t *sm)
sizeof (word), &word, DMU_READ_PREFETCH));
if (sm_entry_is_debug(word)) {
(void) printf("\t [%6llu] %s: txg %llu, pass %llu\n",
(u_longlong_t)(offset / sizeof (word)),
(void) printf("\t [%6llu] %s: txg %llu pass %llu\n",
(u_longlong_t)entry_id,
ddata[SM_DEBUG_ACTION_DECODE(word)],
(u_longlong_t)SM_DEBUG_TXG_DECODE(word),
(u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word));
entry_id++;
continue;
}
@ -846,7 +850,7 @@ dump_spacemap(objset_t *os, space_map_t *sm)
(void) printf("\t [%6llu] %c range:"
" %010llx-%010llx size: %06llx vdev: %06llu words: %u\n",
(u_longlong_t)(offset / sizeof (word)),
(u_longlong_t)entry_id,
entry_type, (u_longlong_t)entry_off,
(u_longlong_t)(entry_off + entry_run),
(u_longlong_t)entry_run,
@ -856,8 +860,9 @@ dump_spacemap(objset_t *os, space_map_t *sm)
alloc += entry_run;
else
alloc -= entry_run;
entry_id++;
}
if ((uint64_t)alloc != space_map_allocated(sm)) {
if (alloc != space_map_allocated(sm)) {
(void) printf("space_map_object alloc (%lld) INCONSISTENT "
"with space map summary (%lld)\n",
(longlong_t)space_map_allocated(sm), (longlong_t)alloc);
@ -921,11 +926,8 @@ dump_metaslab(metaslab_t *msp)
SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
}
if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
}
ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
}
static void
@ -3140,6 +3142,8 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
ddt_entry_t dde;
int error;
ASSERT(!dump_opt['L']);
bzero(&ddb, sizeof (ddb));
while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
blkptr_t blk;
@ -3163,12 +3167,10 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
zcb->zcb_dedup_blocks++;
}
}
if (!dump_opt['L']) {
ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
ddt_enter(ddt);
VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
ddt_exit(ddt);
}
ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
ddt_enter(ddt);
VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
ddt_exit(ddt);
}
ASSERT(error == ENOENT);
@ -3210,6 +3212,9 @@ claim_segment_cb(void *arg, uint64_t offset, uint64_t size)
static void
zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
{
if (dump_opt['L'])
return;
if (spa->spa_vdev_removal == NULL)
return;
@ -3301,7 +3306,6 @@ zdb_load_obsolete_counts(vdev_t *vd)
space_map_t *prev_obsolete_sm = NULL;
VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
space_map_update(prev_obsolete_sm);
vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
prev_obsolete_sm);
space_map_close(prev_obsolete_sm);
@ -3395,9 +3399,9 @@ zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
space_map_update(checkpoint_sm);
VERIFY0(space_map_iterate(checkpoint_sm,
space_map_length(checkpoint_sm),
checkpoint_sm_exclude_entry_cb, &cseea));
space_map_close(checkpoint_sm);
@ -3407,6 +3411,8 @@ zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
static void
zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
{
ASSERT(!dump_opt['L']);
vdev_t *rvd = spa->spa_root_vdev;
for (uint64_t c = 0; c < rvd->vdev_children; c++) {
ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
@ -3503,6 +3509,8 @@ load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
static void
zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
{
ASSERT(!dump_opt['L']);
vdev_t *rvd = spa->spa_root_vdev;
for (uint64_t c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
@ -3549,67 +3557,63 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
{
zcb->zcb_spa = spa;
if (!dump_opt['L']) {
dsl_pool_t *dp = spa->spa_dsl_pool;
vdev_t *rvd = spa->spa_root_vdev;
if (dump_opt['L'])
return;
/*
* We are going to be changing the meaning of the metaslab's
* ms_allocatable. Ensure that the allocator doesn't try to
* use the tree.
*/
spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
dsl_pool_t *dp = spa->spa_dsl_pool;
vdev_t *rvd = spa->spa_root_vdev;
zcb->zcb_vd_obsolete_counts =
umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
UMEM_NOFAIL);
/*
* We are going to be changing the meaning of the metaslab's
* ms_allocatable. Ensure that the allocator doesn't try to
* use the tree.
*/
spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
/*
* For leak detection, we overload the ms_allocatable trees
* to contain allocated segments instead of free segments.
* As a result, we can't use the normal metaslab_load/unload
* interfaces.
*/
zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
zcb->zcb_vd_obsolete_counts =
umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
UMEM_NOFAIL);
/*
* On load_concrete_ms_allocatable_trees() we loaded all the
* allocated entries from the ms_sm to the ms_allocatable for
* each metaslab. If the pool has a checkpoint or is in the
* middle of discarding a checkpoint, some of these blocks
* may have been freed but their ms_sm may not have been
* updated because they are referenced by the checkpoint. In
* order to avoid false-positives during leak-detection, we
* go through the vdev's checkpoint space map and exclude all
* its entries from their relevant ms_allocatable.
*
* We also aggregate the space held by the checkpoint and add
* it to zcb_checkpoint_size.
*
* Note that at this point we are also verifying that all the
* entries on the checkpoint_sm are marked as allocated in
* the ms_sm of their relevant metaslab.
* [see comment in checkpoint_sm_exclude_entry_cb()]
*/
zdb_leak_init_exclude_checkpoint(spa, zcb);
/*
* For leak detection, we overload the ms_allocatable trees
* to contain allocated segments instead of free segments.
* As a result, we can't use the normal metaslab_load/unload
* interfaces.
*/
zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
/* for cleaner progress output */
(void) fprintf(stderr, "\n");
/*
* On load_concrete_ms_allocatable_trees() we loaded all the
* allocated entries from the ms_sm to the ms_allocatable for
* each metaslab. If the pool has a checkpoint or is in the
* middle of discarding a checkpoint, some of these blocks
* may have been freed but their ms_sm may not have been
* updated because they are referenced by the checkpoint. In
* order to avoid false-positives during leak-detection, we
* go through the vdev's checkpoint space map and exclude all
* its entries from their relevant ms_allocatable.
*
* We also aggregate the space held by the checkpoint and add
* it to zcb_checkpoint_size.
*
* Note that at this point we are also verifying that all the
* entries on the checkpoint_sm are marked as allocated in
* the ms_sm of their relevant metaslab.
* [see comment in checkpoint_sm_exclude_entry_cb()]
*/
zdb_leak_init_exclude_checkpoint(spa, zcb);
ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));
if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
ASSERT(spa_feature_is_enabled(spa,
SPA_FEATURE_DEVICE_REMOVAL));
(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
increment_indirect_mapping_cb, zcb, NULL);
}
} else {
/*
* If leak tracing is disabled, we still need to consider
* any checkpointed space in our space verification.
*/
zcb->zcb_checkpoint_size += spa_get_checkpoint_space(spa);
/* for cleaner progress output */
(void) fprintf(stderr, "\n");
if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
ASSERT(spa_feature_is_enabled(spa,
SPA_FEATURE_DEVICE_REMOVAL));
(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
increment_indirect_mapping_cb, zcb, NULL);
}
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
@ -3690,52 +3694,58 @@ zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
static boolean_t
zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
{
boolean_t leaks = B_FALSE;
if (!dump_opt['L']) {
vdev_t *rvd = spa->spa_root_vdev;
for (unsigned c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
metaslab_group_t *mg = vd->vdev_mg;
if (dump_opt['L'])
return (B_FALSE);
if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
boolean_t leaks = B_FALSE;
vdev_t *rvd = spa->spa_root_vdev;
for (unsigned c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
#if DEBUG
metaslab_group_t *mg = vd->vdev_mg;
#endif
if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
}
for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
ASSERT3P(mg, ==, msp->ms_group);
/*
* ms_allocatable has been overloaded
* to contain allocated segments. Now that
* we finished traversing all blocks, any
* block that remains in the ms_allocatable
* represents an allocated block that we
* did not claim during the traversal.
* Claimed blocks would have been removed
* from the ms_allocatable. For indirect
* vdevs, space remaining in the tree
* represents parts of the mapping that are
* not referenced, which is not a bug.
*/
if (vd->vdev_ops == &vdev_indirect_ops) {
range_tree_vacate(msp->ms_allocatable,
NULL, NULL);
} else {
range_tree_vacate(msp->ms_allocatable,
zdb_leak, vd);
}
for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
ASSERT3P(mg, ==, msp->ms_group);
/*
* ms_allocatable has been overloaded
* to contain allocated segments. Now that
* we finished traversing all blocks, any
* block that remains in the ms_allocatable
* represents an allocated block that we
* did not claim during the traversal.
* Claimed blocks would have been removed
* from the ms_allocatable. For indirect
* vdevs, space remaining in the tree
* represents parts of the mapping that are
* not referenced, which is not a bug.
*/
if (vd->vdev_ops == &vdev_indirect_ops) {
range_tree_vacate(msp->ms_allocatable,
NULL, NULL);
} else {
range_tree_vacate(msp->ms_allocatable,
zdb_leak, vd);
}
if (msp->ms_loaded) {
msp->ms_loaded = B_FALSE;
}
if (msp->ms_loaded) {
msp->ms_loaded = B_FALSE;
}
}
umem_free(zcb->zcb_vd_obsolete_counts,
rvd->vdev_children * sizeof (uint32_t *));
zcb->zcb_vd_obsolete_counts = NULL;
}
umem_free(zcb->zcb_vd_obsolete_counts,
rvd->vdev_children * sizeof (uint32_t *));
zcb->zcb_vd_obsolete_counts = NULL;
return (leaks);
}
@ -3774,13 +3784,18 @@ dump_block_stats(spa_t *spa)
!dump_opt['L'] ? "nothing leaked " : "");
/*
* Load all space maps as SM_ALLOC maps, then traverse the pool
* claiming each block we discover. If the pool is perfectly
* consistent, the space maps will be empty when we're done.
* Anything left over is a leak; any block we can't claim (because
* it's not part of any space map) is a double allocation,
* reference to a freed block, or an unclaimed log block.
* When leak detection is enabled we load all space maps as SM_ALLOC
* maps, then traverse the pool claiming each block we discover. If
* the pool is perfectly consistent, the segment trees will be empty
* when we're done. Anything left over is a leak; any block we can't
* claim (because it's not part of any space map) is a double
* allocation, reference to a freed block, or an unclaimed log block.
*
* When leak detection is disabled (-L option) we still traverse the
* pool claiming each block we discover, but we skip opening any space
* maps.
*/
bzero(&zcb, sizeof (zdb_cb_t));
zdb_leak_init(spa, &zcb);
/*
@ -3859,11 +3874,10 @@ dump_block_stats(spa_t *spa)
total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
if (total_found == total_alloc) {
if (!dump_opt['L'])
(void) printf("\n\tNo leaks (block sum matches space"
" maps exactly)\n");
} else {
if (total_found == total_alloc && !dump_opt['L']) {
(void) printf("\n\tNo leaks (block sum matches space"
" maps exactly)\n");
} else if (!dump_opt['L']) {
(void) printf("block traversal size %llu != alloc %llu "
"(%s %lld)\n",
(u_longlong_t)total_found,
@ -4203,7 +4217,6 @@ verify_device_removal_feature_counts(spa_t *spa)
spa->spa_meta_objset,
scip->scip_prev_obsolete_sm_object,
0, vd->vdev_asize, 0));
space_map_update(prev_obsolete_sm);
dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
(void) printf("\n");
space_map_close(prev_obsolete_sm);
@ -4409,7 +4422,8 @@ verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
* their respective ms_allocatable trees should not contain them.
*/
mutex_enter(&ms->ms_lock);
range_tree_verify(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
range_tree_verify_not_present(ms->ms_allocatable,
sme->sme_offset, sme->sme_run);
mutex_exit(&ms->ms_lock);
return (0);
@ -4472,7 +4486,6 @@ verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
checkpoint_sm_obj, 0, current_vd->vdev_asize,
current_vd->vdev_ashift));
space_map_update(checkpoint_sm);
verify_checkpoint_sm_entry_cb_arg_t vcsec;
vcsec.vcsec_vd = ckpoint_vd;
@ -4480,6 +4493,7 @@ verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
vcsec.vcsec_num_entries =
space_map_length(checkpoint_sm) / sizeof (uint64_t);
VERIFY0(space_map_iterate(checkpoint_sm,
space_map_length(checkpoint_sm),
verify_checkpoint_sm_entry_cb, &vcsec));
dump_spacemap(current->spa_meta_objset, checkpoint_sm);
space_map_close(checkpoint_sm);
@ -4559,7 +4573,7 @@ verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
* are part of the checkpoint were freed by mistake.
*/
range_tree_walk(ckpoint_msp->ms_allocatable,
(range_tree_func_t *)range_tree_verify,
(range_tree_func_t *)range_tree_verify_not_present,
current_msp->ms_allocatable);
}
}
@ -4571,6 +4585,8 @@ verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
static void
verify_checkpoint_blocks(spa_t *spa)
{
ASSERT(!dump_opt['L']);
spa_t *checkpoint_spa;
char *checkpoint_pool;
nvlist_t *config = NULL;
@ -4636,7 +4652,6 @@ dump_leftover_checkpoint_blocks(spa_t *spa)
VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
space_map_update(checkpoint_sm);
dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
space_map_close(checkpoint_sm);
}
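Taken together, the zdb.c hunks above converge on one control-flow convention: routines that only matter for leak detection now return immediately when -L was given, or assert that they can no longer be reached with -L set, instead of nesting their whole body under if (!dump_opt['L']). A condensed sketch of that convention (the function names are the real ones from the diff, bodies elided):

static void
zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
{
	if (dump_opt['L'])	/* -L: skip leak detection altogether */
		return;
	/* ... claim blocks referenced by an in-progress device removal ... */
}

static void
zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
{
	/* only reachable when leak detection is actually running */
	ASSERT(!dump_opt['L']);
	/* ... exclude checkpointed blocks from the leak-tracking trees ... */
}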

View File

@ -584,45 +584,62 @@ metaslab_compare(const void *x1, const void *x2)
return (AVL_CMP(m1->ms_start, m2->ms_start));
}
uint64_t
metaslab_allocated_space(metaslab_t *msp)
{
return (msp->ms_allocated_space);
}
/*
* Verify that the space accounting on disk matches the in-core range_trees.
*/
void
static void
metaslab_verify_space(metaslab_t *msp, uint64_t txg)
{
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
uint64_t allocated = 0;
uint64_t allocating = 0;
uint64_t sm_free_space, msp_free_space;
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(!msp->ms_condensing);
if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
return;
/*
* We can only verify the metaslab space when we're called
* from syncing context with a loaded metaslab that has an allocated
* space map. Calling this in non-syncing context does not
* provide a consistent view of the metaslab since we're performing
* allocations in the future.
* from syncing context with a loaded metaslab that has an
* allocated space map. Calling this in non-syncing context
* does not provide a consistent view of the metaslab since
* we're performing allocations in the future.
*/
if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
!msp->ms_loaded)
return;
sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
space_map_alloc_delta(msp->ms_sm);
/*
* Even though the smp_alloc field can get negative (e.g.
* see vdev_checkpoint_sm), that should never be the case
* when it comes to a metaslab's space map.
*/
ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
/*
* Account for future allocations since we would have already
* deducted that space from the ms_freetree.
* Account for future allocations since we would have
* already deducted that space from the ms_allocatable.
*/
for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
allocated +=
allocating +=
range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
}
msp_free_space = range_tree_space(msp->ms_allocatable) + allocated +
ASSERT3U(msp->ms_deferspace, ==,
range_tree_space(msp->ms_defer[0]) +
range_tree_space(msp->ms_defer[1]));
msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
msp->ms_deferspace + range_tree_space(msp->ms_freed);
VERIFY3U(sm_free_space, ==, msp_free_space);
@ -929,6 +946,7 @@ metaslab_group_histogram_verify(metaslab_group_t *mg)
for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
ASSERT(msp != NULL);
/* skip if not active or not a member */
if (msp->ms_sm == NULL || msp->ms_group != mg)
@ -1470,6 +1488,203 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
* ==========================================================================
*/
static void
metaslab_aux_histograms_clear(metaslab_t *msp)
{
/*
* Auxiliary histograms are only cleared when resetting them,
* which can only happen while the metaslab is loaded.
*/
ASSERT(msp->ms_loaded);
bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
for (int t = 0; t < TXG_DEFER_SIZE; t++)
bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
}
static void
metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
range_tree_t *rt)
{
/*
* This is modeled after space_map_histogram_add(), so refer to that
* function for implementation details. We want this to work like
* the space map histogram, and not the range tree histogram, as we
* are essentially constructing a delta that will be later subtracted
* from the space map histogram.
*/
int idx = 0;
for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
ASSERT3U(i, >=, idx + shift);
histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
ASSERT3U(idx + shift, ==, i);
idx++;
ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
}
}
}
/*
* Called at every sync pass that the metaslab gets synced.
*
* The reason is that we want our auxiliary histograms to be updated
* wherever the metaslab's space map histogram is updated. This way
* we stay consistent on which parts of the metaslab space map's
* histogram are currently not available for allocations (e.g. because
* they are in the defer, freed, and freeing trees).
*/
static void
metaslab_aux_histograms_update(metaslab_t *msp)
{
space_map_t *sm = msp->ms_sm;
ASSERT(sm != NULL);
/*
* This is similar to the metaslab's space map histogram updates
* that take place in metaslab_sync(). The only difference is that
* we only care about segments that haven't made it into the
* ms_allocatable tree yet.
*/
if (msp->ms_loaded) {
metaslab_aux_histograms_clear(msp);
metaslab_aux_histogram_add(msp->ms_synchist,
sm->sm_shift, msp->ms_freed);
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
metaslab_aux_histogram_add(msp->ms_deferhist[t],
sm->sm_shift, msp->ms_defer[t]);
}
}
metaslab_aux_histogram_add(msp->ms_synchist,
sm->sm_shift, msp->ms_freeing);
}
/*
* Called every time we are done syncing (writing to) the metaslab,
* i.e. at the end of each sync pass.
* [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
*/
static void
metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
{
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
space_map_t *sm = msp->ms_sm;
if (sm == NULL) {
/*
* We came here from metaslab_init() when creating/opening a
* pool, looking at a metaslab that hasn't had any allocations
* yet.
*/
return;
}
/*
* This is similar to the actions that we take for the ms_freed
* and ms_defer trees in metaslab_sync_done().
*/
uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
if (defer_allowed) {
bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
sizeof (msp->ms_synchist));
} else {
bzero(msp->ms_deferhist[hist_index],
sizeof (msp->ms_deferhist[hist_index]));
}
bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
}
/*
* Ensure that the metaslab's weight and fragmentation are consistent
* with the contents of the histogram (either the range tree's histogram
* or the space map's depending whether the metaslab is loaded).
*/
static void
metaslab_verify_weight_and_frag(metaslab_t *msp)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
return;
/* see comment in metaslab_verify_unflushed_changes() */
if (msp->ms_group == NULL)
return;
/*
* Devices being removed always return a weight of 0 and leave
* fragmentation and ms_max_size as is - there is nothing for
* us to verify here.
*/
vdev_t *vd = msp->ms_group->mg_vd;
if (vd->vdev_removing)
return;
/*
* If the metaslab is dirty it probably means that we've done
* some allocations or frees that have changed our histograms
* and thus the weight.
*/
for (int t = 0; t < TXG_SIZE; t++) {
if (txg_list_member(&vd->vdev_ms_list, msp, t))
return;
}
/*
* This verification checks that our in-memory state is consistent
* with what's on disk. If the pool is read-only then there aren't
* any changes and we just have the initially-loaded state.
*/
if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
return;
/* some extra verification for the in-core tree, when possible */
if (msp->ms_loaded) {
range_tree_stat_verify(msp->ms_allocatable);
VERIFY(space_map_histogram_verify(msp->ms_sm,
msp->ms_allocatable));
}
uint64_t weight = msp->ms_weight;
uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
uint64_t frag = msp->ms_fragmentation;
uint64_t max_segsize = msp->ms_max_size;
msp->ms_weight = 0;
msp->ms_fragmentation = 0;
msp->ms_max_size = 0;
/*
* This function is used for verification purposes. Regardless of
* whether metaslab_weight() thinks this metaslab should be active or
* not, we want to ensure that the actual weight (and therefore the
* value of ms_weight) would be the same if it was to be recalculated
* at this point.
*/
msp->ms_weight = metaslab_weight(msp) | was_active;
VERIFY3U(max_segsize, ==, msp->ms_max_size);
/*
* If the weight type changed then there is no point in doing
* verification. Revert fields to their original values.
*/
if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
(!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
msp->ms_fragmentation = frag;
msp->ms_weight = weight;
return;
}
VERIFY3U(msp->ms_fragmentation, ==, frag);
VERIFY3U(msp->ms_weight, ==, weight);
}
/*
* Wait for any in-progress metaslab loads to complete.
*/
@ -1491,47 +1706,94 @@ metaslab_load_impl(metaslab_t *msp)
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(msp->ms_loading);
ASSERT(!msp->ms_condensing);
/*
* Nobody else can manipulate a loading metaslab, so it's now safe
* to drop the lock. This way we don't have to hold the lock while
* reading the spacemap from disk.
* We temporarily drop the lock to unblock other operations while we
* are reading the space map. Therefore, metaslab_sync() and
* metaslab_sync_done() can run at the same time as we do.
*
* metaslab_sync() can append to the space map while we are loading.
* Therefore we load only entries that existed when we started the
* load. Additionally, metaslab_sync_done() has to wait for the load
* to complete because there are potential races like metaslab_load()
* loading parts of the space map that are currently being appended
* by metaslab_sync(). If we didn't, the ms_allocatable would have
* entries that metaslab_sync_done() would try to re-add later.
*
* That's why before dropping the lock we remember the synced length
* of the metaslab and read up to that point of the space map,
* ignoring entries appended by metaslab_sync() that happen after we
* drop the lock.
*/
uint64_t length = msp->ms_synced_length;
mutex_exit(&msp->ms_lock);
/*
* If the space map has not been allocated yet, then treat
* all the space in the metaslab as free and add it to ms_allocatable.
*/
if (msp->ms_sm != NULL) {
error = space_map_load(msp->ms_sm, msp->ms_allocatable,
SM_FREE);
error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
SM_FREE, length);
} else {
/*
* The space map has not been allocated yet, so treat
* all the space in the metaslab as free and add it to the
* ms_allocatable tree.
*/
range_tree_add(msp->ms_allocatable,
msp->ms_start, msp->ms_size);
}
/*
* We need to grab the ms_sync_lock to prevent metaslab_sync() from
* changing the ms_sm and the metaslab's range trees while we are
* about to use them and populate the ms_allocatable. The ms_lock
* is insufficient for this because metaslab_sync() doesn't hold
* the ms_lock while writing the ms_checkpointing tree to disk.
*/
mutex_enter(&msp->ms_sync_lock);
mutex_enter(&msp->ms_lock);
ASSERT(!msp->ms_condensing);
if (error != 0)
if (error != 0) {
mutex_exit(&msp->ms_sync_lock);
return (error);
}
ASSERT3P(msp->ms_group, !=, NULL);
msp->ms_loaded = B_TRUE;
/*
* If the metaslab already has a spacemap, then we need to
* remove all segments from the defer tree; otherwise, the
* metaslab is completely empty and we can skip this.
* The ms_allocatable contains the segments that exist in the
* ms_defer trees [see ms_synced_length]. Thus we need to remove
* them from ms_allocatable as they will be added again in
* metaslab_sync_done().
*/
if (msp->ms_sm != NULL) {
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
range_tree_walk(msp->ms_defer[t],
range_tree_remove, msp->ms_allocatable);
}
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
range_tree_walk(msp->ms_defer[t],
range_tree_remove, msp->ms_allocatable);
}
/*
* Call metaslab_recalculate_weight_and_sort() now that the
* metaslab is loaded so we get the metaslab's real weight.
*
* Unless this metaslab was created with older software and
* has not yet been converted to use segment-based weight, we
* expect the new weight to be better or equal to the weight
* that the metaslab had while it was not loaded. This is
* because the old weight does not take into account the
* consolidation of adjacent segments between TXGs. [see
* comment for ms_synchist and ms_deferhist[] for more info]
*/
uint64_t weight = msp->ms_weight;
metaslab_recalculate_weight_and_sort(msp);
if (!WEIGHT_IS_SPACEBASED(weight))
ASSERT3U(weight, <=, msp->ms_weight);
msp->ms_max_size = metaslab_block_maxsize(msp);
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
metaslab_verify_space(msp, spa_syncing_txg(spa));
mutex_exit(&msp->ms_sync_lock);
return (0);
}
@ -1548,6 +1810,7 @@ metaslab_load(metaslab_t *msp)
if (msp->ms_loaded)
return (0);
VERIFY(!msp->ms_loading);
ASSERT(!msp->ms_condensing);
msp->ms_loading = B_TRUE;
int error = metaslab_load_impl(msp);
@ -1561,10 +1824,29 @@ void
metaslab_unload(metaslab_t *msp)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
metaslab_verify_weight_and_frag(msp);
range_tree_vacate(msp->ms_allocatable, NULL, NULL);
msp->ms_loaded = B_FALSE;
msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
msp->ms_max_size = 0;
/*
* We explicitly recalculate the metaslab's weight based on its space
* map (as it is now not loaded). We want unloaded metaslabs to always
* have their weights calculated from the space map histograms, while
* loaded ones have it calculated from their in-core range tree
* [see metaslab_load()]. This way, the weight reflects the information
* available in-core, whether it is loaded or not.
*
* If ms_group == NULL, it means that we came here from metaslab_fini(),
* at which point it doesn't make sense for us to do the recalculation
* and the sorting.
*/
if (msp->ms_group != NULL)
metaslab_recalculate_weight_and_sort(msp);
}
static void
@ -1604,6 +1886,13 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
/*
* We only open space map objects that already exist. All others
* will be opened when we finally allocate an object for it.
*
* Note:
* When called from vdev_expand(), we can't call into the DMU as
* we are holding the spa_config_lock as a writer and we would
* deadlock [see relevant comment in vdev_metaslab_init()]. In
* that case, the object parameter is zero though, so we won't
* call into the DMU.
*/
if (object != 0) {
error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
@ -1615,14 +1904,17 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
}
ASSERT(ms->ms_sm != NULL);
ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0);
ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
}
/*
* We create the main range tree here, but we don't create the
* We create the ms_allocatable here, but we don't create the
* other range trees until metaslab_sync_done(). This serves
* two purposes: it allows metaslab_sync_done() to detect the
* addition of new space; and for debugging, it ensures that we'd
* data fault on any attempt to use this metaslab before it's ready.
* addition of new space; and for debugging, it ensures that
* we'd data fault on any attempt to use this metaslab before
* it's ready.
*/
ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, &ms->ms_allocatable_by_size,
metaslab_rangesize_compare, 0);
@ -1639,8 +1931,11 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
* out this txg. This ensures that we don't attempt to allocate
* from it before we have initialized it completely.
*/
if (txg <= TXG_INITIAL)
if (txg <= TXG_INITIAL) {
metaslab_sync_done(ms, 0);
metaslab_space_update(vd, mg->mg_class,
metaslab_allocated_space(ms), 0, 0);
}
/*
* If metaslab_debug_load is set and we're initializing a metaslab
@ -1674,7 +1969,7 @@ metaslab_fini(metaslab_t *msp)
mutex_enter(&msp->ms_lock);
VERIFY(msp->ms_group == NULL);
metaslab_space_update(vd, mg->mg_class,
-space_map_allocated(msp->ms_sm), 0, -msp->ms_size);
-metaslab_allocated_space(msp), 0, -msp->ms_size);
space_map_close(msp->ms_sm);
@ -1695,6 +1990,9 @@ metaslab_fini(metaslab_t *msp)
range_tree_destroy(msp->ms_checkpointing);
for (int t = 0; t < TXG_SIZE; t++)
ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
mutex_exit(&msp->ms_lock);
cv_destroy(&msp->ms_load_cv);
mutex_destroy(&msp->ms_lock);
@ -1710,7 +2008,7 @@ metaslab_fini(metaslab_t *msp)
* This table defines a segment size based fragmentation metric that will
* allow each metaslab to derive its own fragmentation value. This is done
* by calculating the space in each bucket of the spacemap histogram and
* multiplying that by the fragmetation metric in this table. Doing
* multiplying that by the fragmentation metric in this table. Doing
* this for all buckets and dividing it by the total amount of free
* space in this metaslab (i.e. the total free space in all buckets) gives
* us the fragmentation metric. This means that a high fragmentation metric
@ -1745,10 +2043,10 @@ int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
};
/*
* Calclate the metaslab's fragmentation metric. A return value
* of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
* not support this metric. Otherwise, the return value should be in the
* range [0, 100].
* Calculate the metaslab's fragmentation metric and set ms_fragmentation.
* Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
* been upgraded and does not support this metric. Otherwise, the return
* value should be in the range [0, 100].
*/
static void
metaslab_set_fragmentation(metaslab_t *msp)
@ -1841,7 +2139,7 @@ metaslab_space_weight(metaslab_t *msp)
/*
* The baseline weight is the metaslab's free space.
*/
space = msp->ms_size - space_map_allocated(msp->ms_sm);
space = msp->ms_size - metaslab_allocated_space(msp);
if (metaslab_fragmentation_factor_enabled &&
msp->ms_fragmentation != ZFS_FRAG_INVALID) {
@ -1945,14 +2243,38 @@ metaslab_weight_from_range_tree(metaslab_t *msp)
static uint64_t
metaslab_weight_from_spacemap(metaslab_t *msp)
{
uint64_t weight = 0;
space_map_t *sm = msp->ms_sm;
ASSERT(!msp->ms_loaded);
ASSERT(sm != NULL);
ASSERT3U(space_map_object(sm), !=, 0);
ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
/*
* Create a joint histogram from all the segments that have made
* it to the metaslab's space map histogram, that are not yet
* available for allocation because they are still in the freeing
* pipeline (e.g. freeing, freed, and defer trees). Then subtract
* these segments from the space map's histogram to get a more
* accurate weight.
*/
uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
deferspace_histogram[i] += msp->ms_synchist[i];
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
deferspace_histogram[i] += msp->ms_deferhist[t][i];
}
}
uint64_t weight = 0;
for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
WEIGHT_SET_COUNT(weight,
msp->ms_sm->sm_phys->smp_histogram[i]);
WEIGHT_SET_INDEX(weight, i +
msp->ms_sm->sm_shift);
ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
deferspace_histogram[i]);
uint64_t count =
sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
if (count != 0) {
WEIGHT_SET_COUNT(weight, count);
WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
WEIGHT_SET_ACTIVE(weight, 0);
break;
}
@ -1977,7 +2299,7 @@ metaslab_segment_weight(metaslab_t *msp)
/*
* The metaslab is completely free.
*/
if (space_map_allocated(msp->ms_sm) == 0) {
if (metaslab_allocated_space(msp) == 0) {
int idx = highbit64(msp->ms_size) - 1;
int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
@ -1999,7 +2321,7 @@ metaslab_segment_weight(metaslab_t *msp)
/*
* If the metaslab is fully allocated then just make the weight 0.
*/
if (space_map_allocated(msp->ms_sm) == msp->ms_size)
if (metaslab_allocated_space(msp) == msp->ms_size)
return (0);
/*
* If the metaslab is already loaded, then use the range tree to
@ -2080,6 +2402,8 @@ metaslab_weight(metaslab_t *msp)
*/
if (msp->ms_loaded)
msp->ms_max_size = metaslab_block_maxsize(msp);
else
ASSERT0(msp->ms_max_size);
/*
* Segment-based weighting requires space map histogram support.
@ -2095,6 +2419,15 @@ metaslab_weight(metaslab_t *msp)
return (weight);
}
void
metaslab_recalculate_weight_and_sort(metaslab_t *msp)
{
/* note: we preserve the mask (e.g. indication of primary, etc.) */
uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
metaslab_group_sort(msp->ms_group, msp,
metaslab_weight(msp) | was_active);
}
static int
metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
int allocator, uint64_t activation_weight)
@ -2479,17 +2812,17 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
VERIFY(txg <= spa_final_dirty_txg(spa));
/*
* The only state that can actually be changing concurrently with
* metaslab_sync() is the metaslab's ms_allocatable. No other
* thread can be modifying this txg's alloc, freeing,
* The only state that can actually be changing concurrently
* with metaslab_sync() is the metaslab's ms_allocatable. No
* other thread can be modifying this txg's alloc, freeing,
* freed, or space_map_phys_t. We drop ms_lock whenever we
* could call into the DMU, because the DMU can call down to us
* (e.g. via zio_free()) at any time.
* could call into the DMU, because the DMU can call down to
* us (e.g. via zio_free()) at any time.
*
* The spa_vdev_remove_thread() can be reading metaslab state
* concurrently, and it is locked out by the ms_sync_lock. Note
* that the ms_lock is insufficient for this, because it is dropped
* by space_map_write().
* concurrently, and it is locked out by the ms_sync_lock.
* Note that the ms_lock is insufficient for this, because it
* is dropped by space_map_write().
*/
tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
@ -2501,7 +2834,9 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
msp->ms_start, msp->ms_size, vd->vdev_ashift));
ASSERT(msp->ms_sm != NULL);
ASSERT0(metaslab_allocated_space(msp));
}
if (!range_tree_is_empty(msp->ms_checkpointing) &&
@ -2549,6 +2884,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
mutex_enter(&msp->ms_lock);
}
msp->ms_allocated_space += range_tree_space(alloctree);
ASSERT3U(msp->ms_allocated_space, >=,
range_tree_space(msp->ms_freeing));
msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
if (!range_tree_is_empty(msp->ms_checkpointing)) {
ASSERT(spa_has_checkpoint(spa));
ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
@ -2562,14 +2902,13 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
space_map_write(vd->vdev_checkpoint_sm,
msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
mutex_enter(&msp->ms_lock);
space_map_update(vd->vdev_checkpoint_sm);
spa->spa_checkpoint_info.sci_dspace +=
range_tree_space(msp->ms_checkpointing);
vd->vdev_stat.vs_checkpoint_space +=
range_tree_space(msp->ms_checkpointing);
ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
-vd->vdev_checkpoint_sm->sm_alloc);
-space_map_allocated(vd->vdev_checkpoint_sm));
range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
}
@ -2614,6 +2953,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
* time we load the space map.
*/
space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
metaslab_aux_histograms_update(msp);
metaslab_group_histogram_add(mg, msp);
metaslab_group_histogram_verify(mg);
@ -2621,16 +2961,18 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
/*
* For sync pass 1, we avoid traversing this txg's free range tree
* and instead will just swap the pointers for freeing and
* freed. We can safely do this since the freed_tree is
* guaranteed to be empty on the initial pass.
* and instead will just swap the pointers for freeing and freed.
* We can safely do this since the freed_tree is guaranteed to be
* empty on the initial pass.
*/
if (spa_sync_pass(spa) == 1) {
range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
ASSERT0(msp->ms_allocated_this_txg);
} else {
range_tree_vacate(msp->ms_freeing,
range_tree_add, msp->ms_freed);
}
msp->ms_allocated_this_txg += range_tree_space(alloctree);
range_tree_vacate(alloctree, NULL, NULL);
ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
@ -2708,7 +3050,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
}
defer_delta = 0;
alloc_delta = space_map_alloc_delta(msp->ms_sm);
alloc_delta = msp->ms_allocated_this_txg -
range_tree_space(msp->ms_freed);
if (defer_allowed) {
defer_delta = range_tree_space(msp->ms_freed) -
range_tree_space(*defer_tree);
@ -2740,7 +3083,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
msp->ms_loaded ? range_tree_add : NULL,
msp->ms_allocatable);
}
space_map_update(msp->ms_sm);
msp->ms_synced_length = space_map_length(msp->ms_sm);
msp->ms_deferspace += defer_delta;
ASSERT3S(msp->ms_deferspace, >=, 0);
@ -2752,6 +3096,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
*/
vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
}
metaslab_aux_histograms_update_done(msp, defer_allowed);
if (msp->ms_new) {
msp->ms_new = B_FALSE;
@ -2759,12 +3104,12 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
mg->mg_ms_ready++;
mutex_exit(&mg->mg_lock);
}
/*
* Calculate the new weights before unloading any metaslabs.
* This will give us the most accurate weighting.
* Re-sort metaslab within its group now that we've adjusted
* its allocatable space.
*/
metaslab_group_sort(mg, msp, metaslab_weight(msp) |
(msp->ms_weight & METASLAB_ACTIVE_MASK));
metaslab_recalculate_weight_and_sort(msp);
/*
* If the metaslab is loaded and we've not tried to load or allocate
@ -2791,6 +3136,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
ASSERT0(range_tree_space(msp->ms_freed));
ASSERT0(range_tree_space(msp->ms_checkpointing));
msp->ms_allocated_this_txg = 0;
mutex_exit(&msp->ms_lock);
}
@ -4073,7 +4419,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
zio_alloc_list_t *zal, zio_t *zio, int allocator)
{
dva_t *dva = bp->blk_dva;
dva_t *hintdva = hintbp->blk_dva;
dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
int error = 0;
ASSERT(bp->blk_birth == 0);
@ -4240,14 +4586,16 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
mutex_enter(&msp->ms_lock);
if (msp->ms_loaded)
range_tree_verify(msp->ms_allocatable, offset, size);
if (msp->ms_loaded) {
range_tree_verify_not_present(msp->ms_allocatable,
offset, size);
}
range_tree_verify(msp->ms_freeing, offset, size);
range_tree_verify(msp->ms_checkpointing, offset, size);
range_tree_verify(msp->ms_freed, offset, size);
range_tree_verify_not_present(msp->ms_freeing, offset, size);
range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
range_tree_verify_not_present(msp->ms_freed, offset, size);
for (int j = 0; j < TXG_DEFER_SIZE; j++)
range_tree_verify(msp->ms_defer[j], offset, size);
range_tree_verify_not_present(msp->ms_defer[j], offset, size);
mutex_exit(&msp->ms_lock);
}
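The long comment in metaslab_load_impl() above describes the ms_synced_length scheme; here is a condensed sketch of that load path with the verification and the no-space-map branch elided. metaslab_load_sketch() is a hypothetical name; everything else is from the hunk:

static int
metaslab_load_sketch(metaslab_t *msp)
{
	/* remember how much of the space map had been synced ... */
	uint64_t length = msp->ms_synced_length;
	mutex_exit(&msp->ms_lock);

	/* ... and read only that prefix, so entries appended by
	 * metaslab_sync() while the lock is dropped are ignored */
	int error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
	    SM_FREE, length);

	/* ms_sync_lock keeps metaslab_sync() from changing ms_sm and the
	 * range trees while we finish populating ms_allocatable */
	mutex_enter(&msp->ms_sync_lock);
	mutex_enter(&msp->ms_lock);
	if (error != 0) {
		mutex_exit(&msp->ms_sync_lock);
		return (error);
	}
	msp->ms_loaded = B_TRUE;
	mutex_exit(&msp->ms_sync_lock);
	return (0);
}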

View File

@ -511,13 +511,11 @@ range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
}
void
range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
range_tree_verify_not_present(range_tree_t *rt, uint64_t off, uint64_t size)
{
range_seg_t *rs;
rs = range_tree_find(rt, off, size);
range_seg_t *rs = range_tree_find(rt, off, size);
if (rs != NULL)
panic("freeing free block; rs=%p", (void *)rs);
panic("segment already in tree; rs=%p", (void *)rs);
}
boolean_t

View File

@ -129,7 +129,7 @@
* uberblock would reference data in the removed device. For this reason
* and others of similar nature, we disallow the following operations that
* can change the config:
* vdev removal and attach/detach, mirror splitting, and pool reguid.
* vdev removal and attach/detach, mirror splitting, and pool reguid.
*
* - As most of the checkpoint logic is implemented in the SPA and doesn't
* distinguish datasets when it comes to space accounting, having a
@ -262,7 +262,7 @@ spa_checkpoint_accounting_verify(spa_t *spa)
if (vd->vdev_checkpoint_sm != NULL) {
ckpoint_sm_space_sum +=
-vd->vdev_checkpoint_sm->sm_alloc;
-space_map_allocated(vd->vdev_checkpoint_sm);
vs_ckpoint_space_sum +=
vd->vdev_stat.vs_checkpoint_space;
ASSERT3U(ckpoint_sm_space_sum, ==,
@ -347,7 +347,7 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
error, vd->vdev_id);
}
ASSERT0(words_after);
ASSERT0(vd->vdev_checkpoint_sm->sm_alloc);
ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm));
ASSERT0(space_map_length(vd->vdev_checkpoint_sm));
space_map_free(vd->vdev_checkpoint_sm, tx);

View File

@ -23,7 +23,7 @@
* Use is subject to license terms.
*/
/*
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@ -86,20 +86,22 @@ sm_entry_is_double_word(uint64_t e)
/*
* Iterate through the space map, invoking the callback on each (non-debug)
* space map entry.
* space map entry. Stop after reading 'end' bytes of the space map.
*/
int
space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg)
space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
{
uint64_t sm_len = space_map_length(sm);
ASSERT3U(sm->sm_blksz, !=, 0);
uint64_t blksz = sm->sm_blksz;
dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, sm_len,
ASSERT3U(blksz, !=, 0);
ASSERT3U(end, <=, space_map_length(sm));
ASSERT0(P2PHASE(end, sizeof (uint64_t)));
dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end,
ZIO_PRIORITY_SYNC_READ);
uint64_t blksz = sm->sm_blksz;
int error = 0;
for (uint64_t block_base = 0; block_base < sm_len && error == 0;
for (uint64_t block_base = 0; block_base < end && error == 0;
block_base += blksz) {
dmu_buf_t *db;
error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
@ -108,7 +110,7 @@ space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg)
return (error);
uint64_t *block_start = db->db_data;
uint64_t block_length = MIN(sm_len - block_base, blksz);
uint64_t block_length = MIN(end - block_base, blksz);
uint64_t *block_end = block_start +
(block_length / sizeof (uint64_t));
@ -191,7 +193,7 @@ space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
* dmu_buf_hold().
*/
uint64_t last_word_offset =
sm->sm_phys->smp_objsize - sizeof (uint64_t);
sm->sm_phys->smp_length - sizeof (uint64_t);
error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
FTAG, &db, DMU_READ_NO_PREFETCH);
if (error != 0)
@ -204,7 +206,7 @@ space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
uint64_t *words = db->db_data;
*nwords =
(sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
(sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));
@ -303,8 +305,7 @@ space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
uint64_t e = buf[i];
if (sm_entry_is_debug(e)) {
sm->sm_phys->smp_objsize -= sizeof (uint64_t);
space_map_update(sm);
sm->sm_phys->smp_length -= sizeof (uint64_t);
continue;
}
@ -359,15 +360,13 @@ space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
sm->sm_phys->smp_alloc -= entry_run;
else
sm->sm_phys->smp_alloc += entry_run;
sm->sm_phys->smp_objsize -= words * sizeof (uint64_t);
space_map_update(sm);
sm->sm_phys->smp_length -= words * sizeof (uint64_t);
}
}
if (space_map_length(sm) == 0) {
ASSERT0(error);
ASSERT0(sm->sm_phys->smp_objsize);
ASSERT0(sm->sm_alloc);
ASSERT0(space_map_allocated(sm));
}
zio_buf_free(buf, bufsz);
@ -395,6 +394,33 @@ space_map_load_callback(space_map_entry_t *sme, void *arg)
return (0);
}
/*
* Load the space map into the range tree, like space_map_load(), but only
* read the first 'length' bytes of the space map.
*/
int
space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
uint64_t length)
{
space_map_load_arg_t smla;
VERIFY0(range_tree_space(rt));
if (maptype == SM_FREE)
range_tree_add(rt, sm->sm_start, sm->sm_size);
smla.smla_rt = rt;
smla.smla_sm = sm;
smla.smla_type = maptype;
int err = space_map_iterate(sm, length,
space_map_load_callback, &smla);
if (err != 0)
range_tree_vacate(rt, NULL, NULL);
return (err);
}
/*
* Load the space map disk into the specified range tree. Segments of maptype
* are added to the range tree, other segment types are removed.
@ -402,30 +428,7 @@ space_map_load_callback(space_map_entry_t *sme, void *arg)
int
space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
{
uint64_t space;
int err;
space_map_load_arg_t smla;
VERIFY0(range_tree_space(rt));
space = space_map_allocated(sm);
if (maptype == SM_FREE) {
range_tree_add(rt, sm->sm_start, sm->sm_size);
space = sm->sm_size - space;
}
smla.smla_rt = rt;
smla.smla_sm = sm;
smla.smla_type = maptype;
err = space_map_iterate(sm, space_map_load_callback, &smla);
if (err == 0) {
VERIFY3U(range_tree_space(rt), ==, space);
} else {
range_tree_vacate(rt, NULL, NULL);
}
return (err);
return (space_map_load_length(sm, rt, maptype, space_map_length(sm)));
}
void
@ -511,10 +514,10 @@ space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx)
SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) |
SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_objsize,
dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length,
sizeof (dentry), &dentry, tx);
sm->sm_phys->smp_objsize += sizeof (dentry);
sm->sm_phys->smp_length += sizeof (dentry);
}
/*
@ -546,7 +549,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
uint64_t *block_base = db->db_data;
uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
uint64_t *block_cursor = block_base +
(sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
(sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
ASSERT3P(block_cursor, <=, block_end);
@ -569,7 +572,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
if (block_cursor == block_end) {
dmu_buf_rele(db, tag);
uint64_t next_word_offset = sm->sm_phys->smp_objsize;
uint64_t next_word_offset = sm->sm_phys->smp_length;
VERIFY0(dmu_buf_hold(sm->sm_os,
space_map_object(sm), next_word_offset,
tag, &db, DMU_READ_PREFETCH));
@ -599,7 +602,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
SM_DEBUG_SYNCPASS_ENCODE(0) |
SM_DEBUG_TXG_ENCODE(0);
block_cursor++;
sm->sm_phys->smp_objsize += sizeof (uint64_t);
sm->sm_phys->smp_length += sizeof (uint64_t);
ASSERT3P(block_cursor, ==, block_end);
continue;
}
@ -630,7 +633,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
words);
break;
}
sm->sm_phys->smp_objsize += words * sizeof (uint64_t);
sm->sm_phys->smp_length += words * sizeof (uint64_t);
start += run_len;
size -= run_len;
@ -657,7 +660,7 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
* We do this right after we write the intro debug entry
* because the estimate does not take it into account.
*/
uint64_t initial_objsize = sm->sm_phys->smp_objsize;
uint64_t initial_objsize = sm->sm_phys->smp_length;
uint64_t estimated_growth =
space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
@ -668,7 +671,7 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
* and use that to get a hold of the last block, so we can
* start appending to it.
*/
uint64_t next_word_offset = sm->sm_phys->smp_objsize;
uint64_t next_word_offset = sm->sm_phys->smp_length;
VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
ASSERT3U(db->db_size, ==, sm->sm_blksz);
@ -716,7 +719,7 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
* Therefore we expect the actual objsize to be equal or less
* than whatever we estimated it to be.
*/
ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_objsize);
ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length);
#endif
}
@ -872,23 +875,10 @@ space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx)
}
dmu_buf_will_dirty(sm->sm_dbuf, tx);
sm->sm_phys->smp_objsize = 0;
sm->sm_phys->smp_length = 0;
sm->sm_phys->smp_alloc = 0;
}
/*
* Update the in-core space_map allocation and length values.
*/
void
space_map_update(space_map_t *sm)
{
if (sm == NULL)
return;
sm->sm_alloc = sm->sm_phys->smp_alloc;
sm->sm_length = sm->sm_phys->smp_objsize;
}
uint64_t
space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
@ -1070,32 +1060,14 @@ space_map_object(space_map_t *sm)
return (sm != NULL ? sm->sm_object : 0);
}
/*
* Returns the already synced, on-disk allocated space.
*/
uint64_t
int64_t
space_map_allocated(space_map_t *sm)
{
return (sm != NULL ? sm->sm_alloc : 0);
return (sm != NULL ? sm->sm_phys->smp_alloc : 0);
}
/*
* Returns the already synced, on-disk length;
*/
uint64_t
space_map_length(space_map_t *sm)
{
return (sm != NULL ? sm->sm_length : 0);
}
/*
* Returns the allocated space that is currently syncing.
*/
int64_t
space_map_alloc_delta(space_map_t *sm)
{
if (sm == NULL)
return (0);
ASSERT(sm->sm_dbuf != NULL);
return (sm->sm_phys->smp_alloc - space_map_allocated(sm));
return (sm != NULL ? sm->sm_phys->smp_length : 0);
}
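With the in-core sm_alloc/sm_length copies removed above, space_map_allocated() now returns smp_alloc directly as a signed value. That matters for the vdev checkpoint space map, which only ever records frees, so its smp_alloc is non-positive; spa_checkpoint.c and zdb negate it when summing checkpointed space. A small sketch under that assumption (checkpoint_space_sketch() is a hypothetical helper name):

static uint64_t
checkpoint_space_sketch(vdev_t *vd)
{
	/* read straight from the phys structure; no space_map_update() */
	int64_t alloc = space_map_allocated(vd->vdev_checkpoint_sm);

	/* a checkpoint space map only accumulates frees */
	ASSERT3S(alloc, <=, 0);
	return (-alloc);
}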

View File

@ -52,6 +52,8 @@ void metaslab_fini(metaslab_t *);
int metaslab_load(metaslab_t *);
void metaslab_unload(metaslab_t *);
uint64_t metaslab_allocated_space(metaslab_t *);
void metaslab_sync(metaslab_t *, uint64_t);
void metaslab_sync_done(metaslab_t *, uint64_t);
void metaslab_sync_reassess(metaslab_group_t *);
@ -116,6 +118,7 @@ void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
boolean_t);
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
void metaslab_recalculate_weight_and_sort(metaslab_t *);
#ifdef __cplusplus
}

View File

@ -341,8 +341,34 @@ struct metaslab_group {
* being written.
*/
struct metaslab {
/*
* This is the main lock of the metaslab and its purpose is to
* coordinate our allocations and frees [e.g metaslab_block_alloc(),
* metaslab_free_concrete(), ..etc] with our various syncing
* procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc].
*
* The lock is also used during some miscellaneous operations like
* using the metaslab's histogram for the metaslab group's histogram
* aggregation, or marking the metaslab for initialization.
*/
kmutex_t ms_lock;
/*
* Acquired together with the ms_lock whenever we expect to
* write to metaslab data on-disk (i.e flushing entries to
* the metaslab's space map). It helps coordinate readers of
* the metaslab's space map [see spa_vdev_remove_thread()]
* with writers [see metaslab_sync()].
*
* Note that metaslab_load(), even though a reader, uses
* a completely different mechanism to deal with the reading
* of the metaslab's space map based on ms_synced_length. That
* said, the function still uses the ms_sync_lock after it
* has read the ms_sm [see relevant comment in metaslab_load()
* as to why].
*/
kmutex_t ms_sync_lock;
kcondvar_t ms_load_cv;
space_map_t *ms_sm;
uint64_t ms_id;
@ -352,6 +378,7 @@ struct metaslab {
range_tree_t *ms_allocating[TXG_SIZE];
range_tree_t *ms_allocatable;
uint64_t ms_allocated_this_txg;
/*
* The following range trees are accessed only from syncing context.
@ -376,6 +403,55 @@ struct metaslab {
boolean_t ms_loaded;
boolean_t ms_loading;
/*
* The following histograms count entries that are in the
* metaslab's space map (and its histogram) but are not in
* ms_allocatable yet, because they are in ms_freed, ms_freeing,
* or ms_defer[].
*
* When the metaslab is not loaded, its ms_weight needs to
* reflect what is allocatable (i.e. what will be part of
* ms_allocatable if it is loaded). The weight is computed from
* the spacemap histogram, but that includes ranges that are
* not yet allocatable (because they are in ms_freed,
* ms_freeing, or ms_defer[]). Therefore, when calculating the
* weight, we need to remove those ranges.
*
* The ranges in the ms_freed and ms_defer[] range trees are all
* present in the spacemap. However, the spacemap may have
* multiple entries to represent a contiguous range, because it
* is written across multiple sync passes, but the changes of
* all sync passes are consolidated into the range trees.
* Adjacent ranges that are freed in different sync passes of
* one txg will be represented separately (as 2 or more entries)
* in the space map (and its histogram), but these adjacent
* ranges will be consolidated (represented as one entry) in the
* ms_freed/ms_defer[] range trees (and their histograms).
*
* When calculating the weight, we can not simply subtract the
* range trees' histograms from the spacemap's histogram,
* because the range trees' histograms may have entries in
* higher buckets than the spacemap, due to consolidation.
* Instead we must subtract the exact entries that were added to
* the spacemap's histogram. ms_synchist and ms_deferhist[]
* represent these exact entries, so we can subtract them from
* the spacemap's histogram when calculating ms_weight.
*
* ms_synchist represents the same ranges as ms_freeing +
* ms_freed, but without consolidation across sync passes.
*
* ms_deferhist[i] represents the same ranges as ms_defer[i],
* but without consolidation across sync passes.
*/
uint64_t ms_synchist[SPACE_MAP_HISTOGRAM_SIZE];
uint64_t ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE];
/*
* Tracks the exact amount of allocated space of this metaslab
* (and specifically the metaslab's space map) up to the most
* recently completed sync pass [see usage in metaslab_sync()].
*/
uint64_t ms_allocated_space;
int64_t ms_deferspace; /* sum of ms_defermap[] space */
uint64_t ms_weight; /* weight vs. others in group */
uint64_t ms_activation_weight; /* activation weight */
@ -412,6 +488,9 @@ struct metaslab {
avl_node_t ms_group_node; /* node in metaslab group tree */
txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
/* updated every time we are done syncing the metaslab's space map */
uint64_t ms_synced_length;
boolean_t ms_new;
};
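The ms_lock/ms_sync_lock comments above describe a two-lock scheme: code that writes metaslab data on-disk takes ms_sync_lock and then ms_lock, while readers of the already-synced space map contents take only ms_sync_lock. The standalone sketch below models that discipline with pthread mutexes; every name in it (model_metaslab_t, writer_flush, reader_scan) is hypothetical, and the code is a simplified illustration rather than the ZFS implementation.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for a metaslab with an append-only "space map". */
typedef struct model_metaslab {
	pthread_mutex_t	msl_lock;		/* models ms_lock */
	pthread_mutex_t	msl_sync_lock;		/* models ms_sync_lock */
	uint64_t	msl_entries[64];	/* models on-disk entries */
	uint64_t	msl_synced_length;	/* models ms_synced_length */
} model_metaslab_t;

/* Writer (think metaslab_sync()): append under both locks. */
static void
writer_flush(model_metaslab_t *ms, uint64_t entry)
{
	pthread_mutex_lock(&ms->msl_sync_lock);
	pthread_mutex_lock(&ms->msl_lock);
	ms->msl_entries[ms->msl_synced_length] = entry;
	ms->msl_synced_length++;	/* publish only after the write */
	pthread_mutex_unlock(&ms->msl_lock);
	pthread_mutex_unlock(&ms->msl_sync_lock);
}

/* Reader (think spa_vdev_remove_thread()): scan under ms_sync_lock only. */
static uint64_t
reader_scan(model_metaslab_t *ms)
{
	uint64_t sum = 0;

	pthread_mutex_lock(&ms->msl_sync_lock);
	for (uint64_t i = 0; i < ms->msl_synced_length; i++)
		sum += ms->msl_entries[i];
	pthread_mutex_unlock(&ms->msl_sync_lock);
	return (sum);
}

int
main(void)
{
	model_metaslab_t ms;

	pthread_mutex_init(&ms.msl_lock, NULL);
	pthread_mutex_init(&ms.msl_sync_lock, NULL);
	ms.msl_synced_length = 0;

	writer_flush(&ms, 16);
	writer_flush(&ms, 32);
	printf("sum of synced entries: %llu\n",
	    (unsigned long long)reader_scan(&ms));

	pthread_mutex_destroy(&ms.msl_lock);
	pthread_mutex_destroy(&ms.msl_sync_lock);
	return (0);
}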

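The ms_synchist/ms_deferhist comment explains that, for an unloaded metaslab, the weight must be derived from the space map histogram after subtracting the exact entries that are not yet allocatable. Below is a minimal, self-contained model of that subtraction; the bucket count, the "highest non-empty bucket" weight, and all model_* names are simplifications invented for illustration, not the real metaslab_weight() code.

#include <stdint.h>
#include <stdio.h>

#define	MODEL_HISTOGRAM_SIZE	8	/* stands in for SPACE_MAP_HISTOGRAM_SIZE */
#define	MODEL_DEFER_SIZE	2	/* stands in for TXG_DEFER_SIZE */
#define	MIN(a, b)		((a) < (b) ? (a) : (b))

/*
 * Remove the not-yet-allocatable entries (synchist + deferhist[]) from the
 * space map histogram, then pick the highest non-empty bucket as a crude
 * weight. The real code also factors in sm_shift and segment-based weights.
 */
static uint64_t
model_weight(uint64_t smhist[MODEL_HISTOGRAM_SIZE],
    uint64_t synchist[MODEL_HISTOGRAM_SIZE],
    uint64_t deferhist[MODEL_DEFER_SIZE][MODEL_HISTOGRAM_SIZE])
{
	uint64_t weight = 0;

	for (int i = MODEL_HISTOGRAM_SIZE - 1; i >= 0; i--) {
		uint64_t count = smhist[i];

		count -= MIN(count, synchist[i]);
		for (int d = 0; d < MODEL_DEFER_SIZE; d++)
			count -= MIN(count, deferhist[d][i]);

		if (count != 0 && weight == 0)
			weight = (uint64_t)i + 1;	/* highest usable bucket */
	}
	return (weight);
}

int
main(void)
{
	uint64_t smhist[MODEL_HISTOGRAM_SIZE] = { 0, 4, 2, 1, 0, 0, 0, 0 };
	uint64_t synchist[MODEL_HISTOGRAM_SIZE] = { 0, 0, 0, 1, 0, 0, 0, 0 };
	uint64_t deferhist[MODEL_DEFER_SIZE][MODEL_HISTOGRAM_SIZE] = {
		{ 0, 1, 0, 0, 0, 0, 0, 0 },
		{ 0, 0, 1, 0, 0, 0, 0, 0 },
	};

	printf("model weight bucket: %llu\n",
	    (unsigned long long)model_weight(smhist, synchist, deferhist));
	return (0);
}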
View File

@ -87,12 +87,13 @@ range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg,
range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg);
void range_tree_destroy(range_tree_t *rt);
boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_verify_not_present(range_tree_t *rt,
uint64_t start, uint64_t size);
range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
uint64_t newstart, uint64_t newsize);
uint64_t range_tree_space(range_tree_t *rt);
boolean_t range_tree_is_empty(range_tree_t *rt);
void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
void range_tree_stat_verify(range_tree_t *rt);
uint64_t range_tree_min(range_tree_t *rt);

View File

@ -24,7 +24,7 @@
*/
/*
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
#ifndef _SYS_SPACE_MAP_H
@ -55,10 +55,17 @@ extern "C" {
* for backward compatibility.
*/
typedef struct space_map_phys {
uint64_t smp_object; /* on-disk space map object */
uint64_t smp_objsize; /* size of the object */
int64_t smp_alloc; /* space allocated from the map */
uint64_t smp_pad[5]; /* reserved */
/* object number: not needed but kept for backwards compatibility */
uint64_t smp_object;
/* length of the object in bytes */
uint64_t smp_length;
/* space allocated from the map */
int64_t smp_alloc;
/* reserved */
uint64_t smp_pad[5];
/*
* The smp_histogram maintains a histogram of free regions. Each
@ -81,8 +88,6 @@ typedef struct space_map {
uint64_t sm_start; /* start of map */
uint64_t sm_size; /* size of map */
uint8_t sm_shift; /* unit shift */
uint64_t sm_length; /* synced length */
int64_t sm_alloc; /* synced space allocated */
objset_t *sm_os; /* objset for this map */
uint64_t sm_object; /* object id for this map */
uint32_t sm_blksz; /* block size for space map */
@ -189,18 +194,20 @@ boolean_t sm_entry_is_double_word(uint64_t e);
typedef int (*sm_cb_t)(space_map_entry_t *sme, void *arg);
int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);
int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg);
int space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
uint64_t length);
int space_map_iterate(space_map_t *sm, uint64_t length,
sm_cb_t callback, void *arg);
int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
dmu_tx_t *tx);
boolean_t space_map_histogram_verify(space_map_t *sm, range_tree_t *rt);
void space_map_histogram_clear(space_map_t *sm);
void space_map_histogram_add(space_map_t *sm, range_tree_t *rt,
dmu_tx_t *tx);
void space_map_update(space_map_t *sm);
uint64_t space_map_object(space_map_t *sm);
uint64_t space_map_allocated(space_map_t *sm);
int64_t space_map_allocated(space_map_t *sm);
uint64_t space_map_length(space_map_t *sm);
void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
@ -216,8 +223,6 @@ int space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
uint64_t start, uint64_t size, uint8_t shift);
void space_map_close(space_map_t *sm);
int64_t space_map_alloc_delta(space_map_t *sm);
#ifdef __cplusplus
}
#endif
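The reworked prototypes above have callers pass an explicit length into space_map_iterate() and space_map_load_length(), so a reader only consumes the prefix of the on-disk object it knows to be fully synced (typically ms_synced_length). The sketch below models that idea with a toy one-word encoding and a hypothetical callback type; the real on-disk format (debug entries, one- and two-word entries) is considerably richer.

#include <stdint.h>
#include <stdio.h>

/* Toy entry: high bit = alloc/free flag, low 63 bits = run length. */
#define	MODEL_SME_ALLOC(run)	((1ULL << 63) | (run))
#define	MODEL_SME_FREE(run)	(run)
#define	MODEL_SME_IS_ALLOC(e)	(((e) >> 63) != 0)
#define	MODEL_SME_RUN(e)	((e) & ~(1ULL << 63))

typedef int (*model_sm_cb_t)(uint64_t entry, void *arg);

/*
 * Walk only the first `length` bytes of the buffer, mirroring how the
 * iteration is now bounded by a caller-supplied length instead of a
 * separately maintained in-core copy of it.
 */
static int
model_sm_iterate(const uint64_t *buf, uint64_t length,
    model_sm_cb_t cb, void *arg)
{
	for (uint64_t off = 0; off < length; off += sizeof (uint64_t)) {
		int err = cb(buf[off / sizeof (uint64_t)], arg);
		if (err != 0)
			return (err);
	}
	return (0);
}

/* Accumulate allocated space: ALLOC entries add, FREE entries subtract. */
static int
model_alloc_cb(uint64_t entry, void *arg)
{
	int64_t *alloc = arg;

	if (MODEL_SME_IS_ALLOC(entry))
		*alloc += (int64_t)MODEL_SME_RUN(entry);
	else
		*alloc -= (int64_t)MODEL_SME_RUN(entry);
	return (0);
}

int
main(void)
{
	uint64_t sm_words[3] = {
		MODEL_SME_ALLOC(128),
		MODEL_SME_FREE(32),
		MODEL_SME_ALLOC(64),	/* appended but not yet synced */
	};
	uint64_t synced_length = 2 * sizeof (uint64_t);
	int64_t alloc = 0;

	(void) model_sm_iterate(sm_words, synced_length, model_alloc_cb, &alloc);
	printf("allocated within synced prefix: %lld\n", (long long)alloc);
	return (0);
}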

View File

@ -268,7 +268,6 @@ struct vdev {
uint64_t vdev_islog; /* is an intent log device */
uint64_t vdev_removing; /* device is being removed? */
boolean_t vdev_ishole; /* is a hole in the namespace */
kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */
uint64_t vdev_top_zap;
vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */
@ -327,16 +326,6 @@ struct vdev {
range_tree_t *vdev_obsolete_segments;
space_map_t *vdev_obsolete_sm;
/*
* The queue depth parameters determine how many async writes are
* still pending (i.e. allocated but not yet issued to disk) per
* top-level (vdev_async_write_queue_depth) and the maximum allowed
* (vdev_max_async_write_queue_depth). These values only apply to
* top-level vdevs.
*/
uint64_t vdev_async_write_queue_depth;
uint64_t vdev_max_async_write_queue_depth;
/*
* Protects the vdev_scan_io_queue field itself as well as the
* structure's contents (when present).

View File

@ -630,7 +630,6 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
@ -1032,7 +1031,6 @@ vdev_free(vdev_t *vd)
rw_destroy(&vd->vdev_indirect_rwlock);
mutex_destroy(&vd->vdev_obsolete_lock);
mutex_destroy(&vd->vdev_queue_lock);
mutex_destroy(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_stat_lock);
mutex_destroy(&vd->vdev_probe_lock);
@ -1401,12 +1399,12 @@ vdev_metaslab_fini(vdev_t *vd)
}
if (vd->vdev_ms != NULL) {
uint64_t count = vd->vdev_ms_count;
metaslab_group_t *mg = vd->vdev_mg;
metaslab_group_passivate(mg);
metaslab_group_passivate(vd->vdev_mg);
uint64_t count = vd->vdev_ms_count;
for (uint64_t m = 0; m < count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
if (msp != NULL)
metaslab_fini(msp);
}
@ -1414,6 +1412,9 @@ vdev_metaslab_fini(vdev_t *vd)
vd->vdev_ms = NULL;
vd->vdev_ms_count = 0;
for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
ASSERT0(mg->mg_histogram[i]);
}
ASSERT0(vd->vdev_ms_count);
}
@ -2767,13 +2768,6 @@ vdev_dtl_load(vdev_t *vd)
ASSERT(vd->vdev_dtl_sm != NULL);
mutex_enter(&vd->vdev_dtl_lock);
/*
* Now that we've opened the space_map we need to update
* the in-core DTL.
*/
space_map_update(vd->vdev_dtl_sm);
error = space_map_load(vd->vdev_dtl_sm,
vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
mutex_exit(&vd->vdev_dtl_lock);
@ -2933,10 +2927,6 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
}
dmu_tx_commit(tx);
mutex_enter(&vd->vdev_dtl_lock);
space_map_update(vd->vdev_dtl_sm);
mutex_exit(&vd->vdev_dtl_lock);
}
/*
@ -3079,7 +3069,10 @@ vdev_load(vdev_t *vd)
"asize=%llu", (u_longlong_t)vd->vdev_ashift,
(u_longlong_t)vd->vdev_asize);
return (SET_ERROR(ENXIO));
} else if ((error = vdev_metaslab_init(vd, 0)) != 0) {
}
error = vdev_metaslab_init(vd, 0);
if (error != 0) {
vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
"[error=%d]", error);
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
@ -3093,9 +3086,10 @@ vdev_load(vdev_t *vd)
ASSERT(vd->vdev_asize != 0);
ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
if ((error = space_map_open(&vd->vdev_checkpoint_sm,
error = space_map_open(&vd->vdev_checkpoint_sm,
mos, checkpoint_sm_obj, 0, vd->vdev_asize,
vd->vdev_ashift))) {
vd->vdev_ashift);
if (error != 0) {
vdev_dbgmsg(vd, "vdev_load: space_map_open "
"failed for checkpoint spacemap (obj %llu) "
"[error=%d]",
@ -3103,15 +3097,15 @@ vdev_load(vdev_t *vd)
return (error);
}
ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
space_map_update(vd->vdev_checkpoint_sm);
/*
* Since the checkpoint_sm contains free entries
* exclusively we can use sm_alloc to indicate the
* culmulative checkpointed space that has been freed.
* exclusively we can use space_map_allocated() to
* indicate the cumulative checkpointed space that
* has been freed.
*/
vd->vdev_stat.vs_checkpoint_space =
-vd->vdev_checkpoint_sm->sm_alloc;
-space_map_allocated(vd->vdev_checkpoint_sm);
vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
vd->vdev_stat.vs_checkpoint_space;
}
@ -3143,7 +3137,6 @@ vdev_load(vdev_t *vd)
(u_longlong_t)obsolete_sm_object, error);
return (error);
}
space_map_update(vd->vdev_obsolete_sm);
}
return (0);
@ -3230,47 +3223,6 @@ vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
ASSERT(vd == vd->vdev_top);
ASSERT3U(txg, ==, spa_syncing_txg(spa));
if (vd->vdev_ms != NULL) {
metaslab_group_t *mg = vd->vdev_mg;
metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);
for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
if (msp == NULL || msp->ms_sm == NULL)
continue;
mutex_enter(&msp->ms_lock);
/*
* If the metaslab was not loaded when the vdev
* was removed then the histogram accounting may
* not be accurate. Update the histogram information
* here so that we ensure that the metaslab group
* and metaslab class are up-to-date.
*/
metaslab_group_histogram_remove(mg, msp);
VERIFY0(space_map_allocated(msp->ms_sm));
space_map_close(msp->ms_sm);
msp->ms_sm = NULL;
mutex_exit(&msp->ms_lock);
}
if (vd->vdev_checkpoint_sm != NULL) {
ASSERT(spa_has_checkpoint(spa));
space_map_close(vd->vdev_checkpoint_sm);
vd->vdev_checkpoint_sm = NULL;
}
metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);
for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
ASSERT0(mg->mg_histogram[i]);
}
dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
vdev_destroy_spacemaps(vd, tx);
@ -3304,17 +3256,14 @@ vdev_sync(vdev_t *vd, uint64_t txg)
spa_t *spa = vd->vdev_spa;
vdev_t *lvd;
metaslab_t *msp;
dmu_tx_t *tx;
ASSERT3U(txg, ==, spa->spa_syncing_txg);
dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
dmu_tx_t *tx;
ASSERT(vd->vdev_removing ||
vd->vdev_ops == &vdev_indirect_ops);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
vdev_indirect_sync_obsolete(vd, tx);
dmu_tx_commit(tx);
/*
* If the vdev is indirect, it can't have dirty
@ -3323,6 +3272,7 @@ vdev_sync(vdev_t *vd, uint64_t txg)
if (vd->vdev_ops == &vdev_indirect_ops) {
ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
dmu_tx_commit(tx);
return;
}
}
@ -3333,12 +3283,10 @@ vdev_sync(vdev_t *vd, uint64_t txg)
!vd->vdev_removing) {
ASSERT(vd == vd->vdev_top);
ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
ASSERT(vd->vdev_ms_array != 0);
vdev_config_dirty(vd);
dmu_tx_commit(tx);
}
while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
@ -3357,6 +3305,7 @@ vdev_sync(vdev_t *vd, uint64_t txg)
vdev_remove_empty_log(vd, txg);
(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
dmu_tx_commit(tx);
}
uint64_t
@ -3586,8 +3535,6 @@ vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
*/
if (error == 0 &&
tvd->vdev_checkpoint_sm != NULL) {
ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc,
!=, 0);
error = ZFS_ERR_CHECKPOINT_EXISTS;
}
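The checkpoint hunk in vdev_load() above depends on the fact that a checkpoint space map contains only FREE entries, so its running allocated total can only go negative and its negation equals the cumulative checkpointed space. A tiny arithmetic model of that accounting, with invented names:

#include <stdint.h>
#include <stdio.h>

/*
 * Replay a stream of signed deltas the way a space map tracks smp_alloc:
 * ALLOC entries add, FREE entries subtract. For a checkpoint space map
 * every entry is a FREE, so the running total stays <= 0 and the amount
 * of checkpointed (freed) space is simply its negation.
 */
int
main(void)
{
	int64_t checkpoint_deltas[] = { -4096, -8192, -16384 };	/* all frees */
	int64_t alloc = 0;

	for (size_t i = 0;
	    i < sizeof (checkpoint_deltas) / sizeof (checkpoint_deltas[0]); i++)
		alloc += checkpoint_deltas[i];

	uint64_t vs_checkpoint_space = (uint64_t)(-alloc);
	printf("checkpointed space: %llu bytes\n",
	    (unsigned long long)vs_checkpoint_space);
	return (0);
}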

View File

@ -680,7 +680,6 @@ spa_condense_indirect_thread(void *arg, zthr_t *zthr)
VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
space_map_update(prev_obsolete_sm);
counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
if (prev_obsolete_sm != NULL) {
vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
@ -831,7 +830,6 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
spa->spa_meta_objset, obsolete_sm_object,
0, vd->vdev_asize, 0));
space_map_update(vd->vdev_obsolete_sm);
}
ASSERT(vd->vdev_obsolete_sm != NULL);
@ -840,7 +838,6 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
space_map_write(vd->vdev_obsolete_sm,
vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx);
space_map_update(vd->vdev_obsolete_sm);
range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
}

View File

@ -557,6 +557,7 @@ vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim,
losma.losma_counts = counts;
losma.losma_vim = vim;
VERIFY0(space_map_iterate(obsolete_space_sm,
space_map_length(obsolete_space_sm),
load_obsolete_sm_callback, &losma));
}

View File

@ -442,7 +442,7 @@ vdev_initialize_calculate_progress(vdev_t *vd)
mutex_enter(&msp->ms_lock);
uint64_t ms_free = msp->ms_size -
space_map_allocated(msp->ms_sm);
metaslab_allocated_space(msp);
if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
ms_free /= vd->vdev_top->vdev_children;

View File

@ -283,15 +283,8 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
if (ms->ms_sm == NULL)
continue;
/*
* Sync tasks happen before metaslab_sync(), therefore
* smp_alloc and sm_alloc must be the same.
*/
ASSERT3U(space_map_allocated(ms->ms_sm), ==,
ms->ms_sm->sm_phys->smp_alloc);
spa->spa_removing_phys.sr_to_copy +=
space_map_allocated(ms->ms_sm);
metaslab_allocated_space(ms);
/*
* Space which we are freeing this txg does not need to
@ -1401,22 +1394,8 @@ spa_vdev_remove_thread(void *arg)
* appropriate action (see free_from_removing_vdev()).
*/
if (msp->ms_sm != NULL) {
space_map_t *sm = NULL;
/*
* We have to open a new space map here, because
* ms_sm's sm_length and sm_alloc may not reflect
* what's in the object contents, if we are in between
* metaslab_sync() and metaslab_sync_done().
*/
VERIFY0(space_map_open(&sm,
spa->spa_dsl_pool->dp_meta_objset,
msp->ms_sm->sm_object, msp->ms_sm->sm_start,
msp->ms_sm->sm_size, msp->ms_sm->sm_shift));
space_map_update(sm);
VERIFY0(space_map_load(sm, svr->svr_allocd_segs,
SM_ALLOC));
space_map_close(sm);
VERIFY0(space_map_load(msp->ms_sm,
svr->svr_allocd_segs, SM_ALLOC));
range_tree_walk(msp->ms_freeing,
range_tree_remove, svr->svr_allocd_segs);
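In the spa_vdev_remove_thread() hunk above, the allocated segments are now loaded directly from msp->ms_sm (the load stops at the synced length), and the ranges currently being freed are then walked out of the copy. The bitmap model below only illustrates that subtraction step; the bitmap and the model_* helper are invented stand-ins for the range trees, not real ZFS interfaces.

#include <stdint.h>
#include <stdio.h>

#define	MODEL_NSEGS	16	/* tiny address space, one bit per segment */

/* Models range_tree_remove() applied via range_tree_walk(). */
static void
model_remove_ranges(uint32_t *allocd_segs, uint32_t freeing_segs)
{
	*allocd_segs &= ~freeing_segs;
}

int
main(void)
{
	/* What the synced space map says is allocated. */
	uint32_t allocd_segs = 0x00ff;		/* segments 0-7 */
	/* What this txg is already freeing (ms_freeing). */
	uint32_t freeing_segs = 0x0018;		/* segments 3 and 4 */

	model_remove_ranges(&allocd_segs, freeing_segs);

	printf("segments left to copy:");
	for (int i = 0; i < MODEL_NSEGS; i++)
		if (allocd_segs & (1U << i))
			printf(" %d", i);
	printf("\n");
	return (0);
}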
@ -1612,16 +1591,6 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
ASSERT0(range_tree_space(msp->ms_freed));
if (msp->ms_sm != NULL) {
/*
* Assert that the in-core spacemap has the same
* length as the on-disk one, so we can use the
* existing in-core spacemap to load it from disk.
*/
ASSERT3U(msp->ms_sm->sm_alloc, ==,
msp->ms_sm->sm_phys->smp_alloc);
ASSERT3U(msp->ms_sm->sm_length, ==,
msp->ms_sm->sm_phys->smp_objsize);
mutex_enter(&svr->svr_lock);
VERIFY0(space_map_load(msp->ms_sm,
svr->svr_allocd_segs, SM_ALLOC));
@ -1714,9 +1683,6 @@ spa_vdev_remove_cancel(spa_t *spa)
return (error);
}
/*
* Called every sync pass of every txg if there's a svr.
*/
void
svr_sync(spa_t *spa, dmu_tx_t *tx)
{
@ -1780,6 +1746,7 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
ASSERT(vd->vdev_islog);
ASSERT(vd == vd->vdev_top);
ASSERT(MUTEX_HELD(&spa_namespace_lock));
/*
* Stop allocating from this vdev.
@ -1794,15 +1761,14 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
*txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
/*
* Evacuate the device. We don't hold the config lock as writer
* since we need to do I/O but we do keep the
* Evacuate the device. We don't hold the config lock as
* writer since we need to do I/O but we do keep the
* spa_namespace_lock held. Once this completes the device
* should no longer have any blocks allocated on it.
*/
if (vd->vdev_islog) {
if (vd->vdev_stat.vs_alloc != 0)
error = spa_reset_logs(spa);
}
ASSERT(MUTEX_HELD(&spa_namespace_lock));
if (vd->vdev_stat.vs_alloc != 0)
error = spa_reset_logs(spa);
*txg = spa_vdev_config_enter(spa);
@ -1821,6 +1787,8 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
vdev_dirty_leaves(vd, VDD_DTL, *txg);
vdev_config_dirty(vd);
vdev_metaslab_fini(vd);
spa_history_log_internal(spa, "vdev remove", NULL,
"%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id,
(vd->vdev_path != NULL) ? vd->vdev_path : "-");
@ -1850,6 +1818,8 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
if (list_link_active(&vd->vdev_config_dirty_node))
vdev_config_clean(vd);
ASSERT0(vd->vdev_stat.vs_alloc);
/*
* Clean up the vdev namespace.
*/