Mirror of https://git.FreeBSD.org/src.git (synced 2024-12-03 09:00:21 +00:00)

MFV r354383: 10592 misc. metaslab and vdev related ZoL bug fixes

illumos/illumos-gate@555d674d5d
https://www.illumos.org/issues/10592

This is a collection of recent fixes from ZoL:

  8eef997679  Error path in metaslab_load_impl() forgets to drop ms_sync_lock
  928e8ad47d  Introduce auxiliary metaslab histograms
  425d3237ee  Get rid of space_map_update() for ms_synced_length
  6c926f426a  Simplify log vdev removal code
  21e7cf5da8  zdb -L should skip leak detection altogether
  df72b8bebe  Rename range_tree_verify to range_tree_verify_not_present
  75058f3303  Remove unused vdev_t fields

Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Author: Serapheim Dimitropoulos <serapheim@delphix.com>
MFC after: 4 weeks

This commit is contained in:
commit 8491540808

Notes:
svn2git  2020-12-20 02:59:44 +00:00
svn path=/head/; revision=354948
@@ -10,7 +10,7 @@
 .\"
 .\"
 .\" Copyright 2012, Richard Lowe.
-.\" Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 .\" Copyright 2017 Nexenta Systems, Inc.
 .\"
 .Dd October 06, 2017
@@ -187,7 +187,7 @@ If the
 .Fl u
 option is also specified, also display the uberblocks on this device.
 .It Fl L
-Disable leak tracing and the loading of space maps.
+Disable leak detection and the loading of space maps.
 By default,
 .Nm
 verifies that all non-free blocks are referenced, which can be very expensive.
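Illustrative usage note (not part of the commit; the pool name is hypothetical): with leak detection disabled, an invocation such as "zdb -b -L tank" still traverses the pool and prints block statistics, but it skips loading the space maps, so no leak check is performed.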
@@ -785,18 +785,21 @@ dump_spacemap(objset_t *os, space_map_t *sm)
         return;
 
     (void) printf("space map object %llu:\n",
-        (longlong_t)sm->sm_phys->smp_object);
-    (void) printf("  smp_objsize = 0x%llx\n",
-        (longlong_t)sm->sm_phys->smp_objsize);
+        (longlong_t)sm->sm_object);
+    (void) printf("  smp_length = 0x%llx\n",
+        (longlong_t)sm->sm_phys->smp_length);
     (void) printf("  smp_alloc = 0x%llx\n",
         (longlong_t)sm->sm_phys->smp_alloc);
 
+    if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
+        return;
+
     /*
      * Print out the freelist entries in both encoded and decoded form.
      */
     uint8_t mapshift = sm->sm_shift;
     int64_t alloc = 0;
-    uint64_t word;
+    uint64_t word, entry_id = 0;
     for (uint64_t offset = 0; offset < space_map_length(sm);
         offset += sizeof (word)) {
 
@@ -804,11 +807,12 @@ dump_spacemap(objset_t *os, space_map_t *sm)
             sizeof (word), &word, DMU_READ_PREFETCH));
 
         if (sm_entry_is_debug(word)) {
-            (void) printf("\t [%6llu] %s: txg %llu, pass %llu\n",
-                (u_longlong_t)(offset / sizeof (word)),
+            (void) printf("\t [%6llu] %s: txg %llu pass %llu\n",
+                (u_longlong_t)entry_id,
                 ddata[SM_DEBUG_ACTION_DECODE(word)],
                 (u_longlong_t)SM_DEBUG_TXG_DECODE(word),
                 (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word));
+            entry_id++;
             continue;
         }
 
@@ -846,7 +850,7 @@ dump_spacemap(objset_t *os, space_map_t *sm)
 
         (void) printf("\t [%6llu] %c range:"
             " %010llx-%010llx size: %06llx vdev: %06llu words: %u\n",
-            (u_longlong_t)(offset / sizeof (word)),
+            (u_longlong_t)entry_id,
             entry_type, (u_longlong_t)entry_off,
             (u_longlong_t)(entry_off + entry_run),
             (u_longlong_t)entry_run,
@@ -856,8 +860,9 @@ dump_spacemap(objset_t *os, space_map_t *sm)
             alloc += entry_run;
         else
             alloc -= entry_run;
+        entry_id++;
     }
-    if ((uint64_t)alloc != space_map_allocated(sm)) {
+    if (alloc != space_map_allocated(sm)) {
         (void) printf("space_map_object alloc (%lld) INCONSISTENT "
             "with space map summary (%lld)\n",
             (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
@@ -921,11 +926,8 @@ dump_metaslab(metaslab_t *msp)
             SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
     }
 
-    if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
-        ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
-
-        dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
-    }
+    ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
+    dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
 }
 
 static void
@ -3140,6 +3142,8 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
|
||||
ddt_entry_t dde;
|
||||
int error;
|
||||
|
||||
ASSERT(!dump_opt['L']);
|
||||
|
||||
bzero(&ddb, sizeof (ddb));
|
||||
while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
|
||||
blkptr_t blk;
|
||||
@ -3163,12 +3167,10 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
|
||||
zcb->zcb_dedup_blocks++;
|
||||
}
|
||||
}
|
||||
if (!dump_opt['L']) {
|
||||
ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
|
||||
ddt_enter(ddt);
|
||||
VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
|
||||
ddt_exit(ddt);
|
||||
}
|
||||
ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
|
||||
ddt_enter(ddt);
|
||||
VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
|
||||
ddt_exit(ddt);
|
||||
}
|
||||
|
||||
ASSERT(error == ENOENT);
|
||||
@ -3210,6 +3212,9 @@ claim_segment_cb(void *arg, uint64_t offset, uint64_t size)
|
||||
static void
|
||||
zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
|
||||
{
|
||||
if (dump_opt['L'])
|
||||
return;
|
||||
|
||||
if (spa->spa_vdev_removal == NULL)
|
||||
return;
|
||||
|
||||
@ -3301,7 +3306,6 @@ zdb_load_obsolete_counts(vdev_t *vd)
|
||||
space_map_t *prev_obsolete_sm = NULL;
|
||||
VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
|
||||
scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
|
||||
space_map_update(prev_obsolete_sm);
|
||||
vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
|
||||
prev_obsolete_sm);
|
||||
space_map_close(prev_obsolete_sm);
|
||||
@ -3395,9 +3399,9 @@ zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
|
||||
|
||||
VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
|
||||
checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
|
||||
space_map_update(checkpoint_sm);
|
||||
|
||||
VERIFY0(space_map_iterate(checkpoint_sm,
|
||||
space_map_length(checkpoint_sm),
|
||||
checkpoint_sm_exclude_entry_cb, &cseea));
|
||||
space_map_close(checkpoint_sm);
|
||||
|
||||
@ -3407,6 +3411,8 @@ zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
|
||||
static void
|
||||
zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
|
||||
{
|
||||
ASSERT(!dump_opt['L']);
|
||||
|
||||
vdev_t *rvd = spa->spa_root_vdev;
|
||||
for (uint64_t c = 0; c < rvd->vdev_children; c++) {
|
||||
ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
|
||||
@ -3503,6 +3509,8 @@ load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
|
||||
static void
|
||||
zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
|
||||
{
|
||||
ASSERT(!dump_opt['L']);
|
||||
|
||||
vdev_t *rvd = spa->spa_root_vdev;
|
||||
for (uint64_t c = 0; c < rvd->vdev_children; c++) {
|
||||
vdev_t *vd = rvd->vdev_child[c];
|
||||
@ -3549,67 +3557,63 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
|
||||
{
|
||||
zcb->zcb_spa = spa;
|
||||
|
||||
if (!dump_opt['L']) {
|
||||
dsl_pool_t *dp = spa->spa_dsl_pool;
|
||||
vdev_t *rvd = spa->spa_root_vdev;
|
||||
if (dump_opt['L'])
|
||||
return;
|
||||
|
||||
/*
|
||||
* We are going to be changing the meaning of the metaslab's
|
||||
* ms_allocatable. Ensure that the allocator doesn't try to
|
||||
* use the tree.
|
||||
*/
|
||||
spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
|
||||
spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
|
||||
dsl_pool_t *dp = spa->spa_dsl_pool;
|
||||
vdev_t *rvd = spa->spa_root_vdev;
|
||||
|
||||
zcb->zcb_vd_obsolete_counts =
|
||||
umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
|
||||
UMEM_NOFAIL);
|
||||
/*
|
||||
* We are going to be changing the meaning of the metaslab's
|
||||
* ms_allocatable. Ensure that the allocator doesn't try to
|
||||
* use the tree.
|
||||
*/
|
||||
spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
|
||||
spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
|
||||
|
||||
/*
|
||||
* For leak detection, we overload the ms_allocatable trees
|
||||
* to contain allocated segments instead of free segments.
|
||||
* As a result, we can't use the normal metaslab_load/unload
|
||||
* interfaces.
|
||||
*/
|
||||
zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
|
||||
load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
|
||||
zcb->zcb_vd_obsolete_counts =
|
||||
umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
|
||||
UMEM_NOFAIL);
|
||||
|
||||
/*
|
||||
* On load_concrete_ms_allocatable_trees() we loaded all the
|
||||
* allocated entries from the ms_sm to the ms_allocatable for
|
||||
* each metaslab. If the pool has a checkpoint or is in the
|
||||
* middle of discarding a checkpoint, some of these blocks
|
||||
* may have been freed but their ms_sm may not have been
|
||||
* updated because they are referenced by the checkpoint. In
|
||||
* order to avoid false-positives during leak-detection, we
|
||||
* go through the vdev's checkpoint space map and exclude all
|
||||
* its entries from their relevant ms_allocatable.
|
||||
*
|
||||
* We also aggregate the space held by the checkpoint and add
|
||||
* it to zcb_checkpoint_size.
|
||||
*
|
||||
* Note that at this point we are also verifying that all the
|
||||
* entries on the checkpoint_sm are marked as allocated in
|
||||
* the ms_sm of their relevant metaslab.
|
||||
* [see comment in checkpoint_sm_exclude_entry_cb()]
|
||||
*/
|
||||
zdb_leak_init_exclude_checkpoint(spa, zcb);
|
||||
/*
|
||||
* For leak detection, we overload the ms_allocatable trees
|
||||
* to contain allocated segments instead of free segments.
|
||||
* As a result, we can't use the normal metaslab_load/unload
|
||||
* interfaces.
|
||||
*/
|
||||
zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
|
||||
load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
|
||||
|
||||
/* for cleaner progress output */
|
||||
(void) fprintf(stderr, "\n");
|
||||
/*
|
||||
* On load_concrete_ms_allocatable_trees() we loaded all the
|
||||
* allocated entries from the ms_sm to the ms_allocatable for
|
||||
* each metaslab. If the pool has a checkpoint or is in the
|
||||
* middle of discarding a checkpoint, some of these blocks
|
||||
* may have been freed but their ms_sm may not have been
|
||||
* updated because they are referenced by the checkpoint. In
|
||||
* order to avoid false-positives during leak-detection, we
|
||||
* go through the vdev's checkpoint space map and exclude all
|
||||
* its entries from their relevant ms_allocatable.
|
||||
*
|
||||
* We also aggregate the space held by the checkpoint and add
|
||||
* it to zcb_checkpoint_size.
|
||||
*
|
||||
* Note that at this point we are also verifying that all the
|
||||
* entries on the checkpoint_sm are marked as allocated in
|
||||
* the ms_sm of their relevant metaslab.
|
||||
* [see comment in checkpoint_sm_exclude_entry_cb()]
|
||||
*/
|
||||
zdb_leak_init_exclude_checkpoint(spa, zcb);
|
||||
ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));
|
||||
|
||||
if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
|
||||
ASSERT(spa_feature_is_enabled(spa,
|
||||
SPA_FEATURE_DEVICE_REMOVAL));
|
||||
(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
|
||||
increment_indirect_mapping_cb, zcb, NULL);
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* If leak tracing is disabled, we still need to consider
|
||||
* any checkpointed space in our space verification.
|
||||
*/
|
||||
zcb->zcb_checkpoint_size += spa_get_checkpoint_space(spa);
|
||||
/* for cleaner progress output */
|
||||
(void) fprintf(stderr, "\n");
|
||||
|
||||
if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
|
||||
ASSERT(spa_feature_is_enabled(spa,
|
||||
SPA_FEATURE_DEVICE_REMOVAL));
|
||||
(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
|
||||
increment_indirect_mapping_cb, zcb, NULL);
|
||||
}
|
||||
|
||||
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
|
||||
@ -3690,52 +3694,58 @@ zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
|
||||
static boolean_t
|
||||
zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
|
||||
{
|
||||
boolean_t leaks = B_FALSE;
|
||||
if (!dump_opt['L']) {
|
||||
vdev_t *rvd = spa->spa_root_vdev;
|
||||
for (unsigned c = 0; c < rvd->vdev_children; c++) {
|
||||
vdev_t *vd = rvd->vdev_child[c];
|
||||
metaslab_group_t *mg = vd->vdev_mg;
|
||||
if (dump_opt['L'])
|
||||
return (B_FALSE);
|
||||
|
||||
if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
|
||||
leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
|
||||
boolean_t leaks = B_FALSE;
|
||||
|
||||
vdev_t *rvd = spa->spa_root_vdev;
|
||||
for (unsigned c = 0; c < rvd->vdev_children; c++) {
|
||||
vdev_t *vd = rvd->vdev_child[c];
|
||||
#if DEBUG
|
||||
metaslab_group_t *mg = vd->vdev_mg;
|
||||
#endif
|
||||
|
||||
if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
|
||||
leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
|
||||
}
|
||||
|
||||
for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
|
||||
metaslab_t *msp = vd->vdev_ms[m];
|
||||
ASSERT3P(mg, ==, msp->ms_group);
|
||||
|
||||
/*
|
||||
* ms_allocatable has been overloaded
|
||||
* to contain allocated segments. Now that
|
||||
* we finished traversing all blocks, any
|
||||
* block that remains in the ms_allocatable
|
||||
* represents an allocated block that we
|
||||
* did not claim during the traversal.
|
||||
* Claimed blocks would have been removed
|
||||
* from the ms_allocatable. For indirect
|
||||
* vdevs, space remaining in the tree
|
||||
* represents parts of the mapping that are
|
||||
* not referenced, which is not a bug.
|
||||
*/
|
||||
if (vd->vdev_ops == &vdev_indirect_ops) {
|
||||
range_tree_vacate(msp->ms_allocatable,
|
||||
NULL, NULL);
|
||||
} else {
|
||||
range_tree_vacate(msp->ms_allocatable,
|
||||
zdb_leak, vd);
|
||||
}
|
||||
|
||||
for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
|
||||
metaslab_t *msp = vd->vdev_ms[m];
|
||||
ASSERT3P(mg, ==, msp->ms_group);
|
||||
|
||||
/*
|
||||
* ms_allocatable has been overloaded
|
||||
* to contain allocated segments. Now that
|
||||
* we finished traversing all blocks, any
|
||||
* block that remains in the ms_allocatable
|
||||
* represents an allocated block that we
|
||||
* did not claim during the traversal.
|
||||
* Claimed blocks would have been removed
|
||||
* from the ms_allocatable. For indirect
|
||||
* vdevs, space remaining in the tree
|
||||
* represents parts of the mapping that are
|
||||
* not referenced, which is not a bug.
|
||||
*/
|
||||
if (vd->vdev_ops == &vdev_indirect_ops) {
|
||||
range_tree_vacate(msp->ms_allocatable,
|
||||
NULL, NULL);
|
||||
} else {
|
||||
range_tree_vacate(msp->ms_allocatable,
|
||||
zdb_leak, vd);
|
||||
}
|
||||
|
||||
if (msp->ms_loaded) {
|
||||
msp->ms_loaded = B_FALSE;
|
||||
}
|
||||
if (msp->ms_loaded) {
|
||||
msp->ms_loaded = B_FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
umem_free(zcb->zcb_vd_obsolete_counts,
|
||||
rvd->vdev_children * sizeof (uint32_t *));
|
||||
zcb->zcb_vd_obsolete_counts = NULL;
|
||||
}
|
||||
|
||||
umem_free(zcb->zcb_vd_obsolete_counts,
|
||||
rvd->vdev_children * sizeof (uint32_t *));
|
||||
zcb->zcb_vd_obsolete_counts = NULL;
|
||||
|
||||
return (leaks);
|
||||
}
|
||||
|
||||
@ -3774,13 +3784,18 @@ dump_block_stats(spa_t *spa)
|
||||
!dump_opt['L'] ? "nothing leaked " : "");
|
||||
|
||||
/*
|
||||
* Load all space maps as SM_ALLOC maps, then traverse the pool
|
||||
* claiming each block we discover. If the pool is perfectly
|
||||
* consistent, the space maps will be empty when we're done.
|
||||
* Anything left over is a leak; any block we can't claim (because
|
||||
* it's not part of any space map) is a double allocation,
|
||||
* reference to a freed block, or an unclaimed log block.
|
||||
* When leak detection is enabled we load all space maps as SM_ALLOC
|
||||
* maps, then traverse the pool claiming each block we discover. If
|
||||
* the pool is perfectly consistent, the segment trees will be empty
|
||||
* when we're done. Anything left over is a leak; any block we can't
|
||||
* claim (because it's not part of any space map) is a double
|
||||
* allocation, reference to a freed block, or an unclaimed log block.
|
||||
*
|
||||
* When leak detection is disabled (-L option) we still traverse the
|
||||
* pool claiming each block we discover, but we skip opening any space
|
||||
* maps.
|
||||
*/
|
||||
bzero(&zcb, sizeof (zdb_cb_t));
|
||||
zdb_leak_init(spa, &zcb);
|
||||
|
||||
/*
|
||||
@ -3859,11 +3874,10 @@ dump_block_stats(spa_t *spa)
|
||||
total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
|
||||
zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
|
||||
|
||||
if (total_found == total_alloc) {
|
||||
if (!dump_opt['L'])
|
||||
(void) printf("\n\tNo leaks (block sum matches space"
|
||||
" maps exactly)\n");
|
||||
} else {
|
||||
if (total_found == total_alloc && !dump_opt['L']) {
|
||||
(void) printf("\n\tNo leaks (block sum matches space"
|
||||
" maps exactly)\n");
|
||||
} else if (!dump_opt['L']) {
|
||||
(void) printf("block traversal size %llu != alloc %llu "
|
||||
"(%s %lld)\n",
|
||||
(u_longlong_t)total_found,
|
||||
@ -4203,7 +4217,6 @@ verify_device_removal_feature_counts(spa_t *spa)
|
||||
spa->spa_meta_objset,
|
||||
scip->scip_prev_obsolete_sm_object,
|
||||
0, vd->vdev_asize, 0));
|
||||
space_map_update(prev_obsolete_sm);
|
||||
dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
|
||||
(void) printf("\n");
|
||||
space_map_close(prev_obsolete_sm);
|
||||
@ -4409,7 +4422,8 @@ verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
|
||||
* their respective ms_allocateable trees should not contain them.
|
||||
*/
|
||||
mutex_enter(&ms->ms_lock);
|
||||
range_tree_verify(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
|
||||
range_tree_verify_not_present(ms->ms_allocatable,
|
||||
sme->sme_offset, sme->sme_run);
|
||||
mutex_exit(&ms->ms_lock);
|
||||
|
||||
return (0);
|
||||
@ -4472,7 +4486,6 @@ verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
|
||||
VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
|
||||
checkpoint_sm_obj, 0, current_vd->vdev_asize,
|
||||
current_vd->vdev_ashift));
|
||||
space_map_update(checkpoint_sm);
|
||||
|
||||
verify_checkpoint_sm_entry_cb_arg_t vcsec;
|
||||
vcsec.vcsec_vd = ckpoint_vd;
|
||||
@ -4480,6 +4493,7 @@ verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
|
||||
vcsec.vcsec_num_entries =
|
||||
space_map_length(checkpoint_sm) / sizeof (uint64_t);
|
||||
VERIFY0(space_map_iterate(checkpoint_sm,
|
||||
space_map_length(checkpoint_sm),
|
||||
verify_checkpoint_sm_entry_cb, &vcsec));
|
||||
dump_spacemap(current->spa_meta_objset, checkpoint_sm);
|
||||
space_map_close(checkpoint_sm);
|
||||
@ -4559,7 +4573,7 @@ verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
|
||||
* are part of the checkpoint were freed by mistake.
|
||||
*/
|
||||
range_tree_walk(ckpoint_msp->ms_allocatable,
|
||||
(range_tree_func_t *)range_tree_verify,
|
||||
(range_tree_func_t *)range_tree_verify_not_present,
|
||||
current_msp->ms_allocatable);
|
||||
}
|
||||
}
|
||||
@ -4571,6 +4585,8 @@ verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
|
||||
static void
|
||||
verify_checkpoint_blocks(spa_t *spa)
|
||||
{
|
||||
ASSERT(!dump_opt['L']);
|
||||
|
||||
spa_t *checkpoint_spa;
|
||||
char *checkpoint_pool;
|
||||
nvlist_t *config = NULL;
|
||||
@ -4636,7 +4652,6 @@ dump_leftover_checkpoint_blocks(spa_t *spa)
|
||||
|
||||
VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
|
||||
checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
|
||||
space_map_update(checkpoint_sm);
|
||||
dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
|
||||
space_map_close(checkpoint_sm);
|
||||
}
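The comments in zdb_leak_init() and dump_block_stats() above describe the leak-detection scheme: every metaslab's space map is loaded as SM_ALLOC into its ms_allocatable tree, each block encountered during pool traversal is removed from that tree, and whatever remains afterwards is reported as leaked. A minimal conceptual sketch of that idea (hypothetical helper names, not the actual zdb code):

/* rt initially holds every allocated segment (loaded with SM_ALLOC). */
static void
claim_block(range_tree_t *rt, uint64_t offset, uint64_t size)
{
    /*
     * A block we cannot claim was never marked allocated on disk:
     * a double allocation, a reference to a freed block, or an
     * unclaimed log block.
     */
    if (!range_tree_contains(rt, offset, size))
        panic("claimed block is not marked allocated");
    range_tree_remove(rt, offset, size);
}

static boolean_t
leaks_found(range_tree_t *rt)
{
    /* Anything still in the tree was allocated but never claimed. */
    return (range_tree_space(rt) != 0);
}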
|
||||
|
@ -584,45 +584,62 @@ metaslab_compare(const void *x1, const void *x2)
|
||||
return (AVL_CMP(m1->ms_start, m2->ms_start));
|
||||
}
|
||||
|
||||
uint64_t
|
||||
metaslab_allocated_space(metaslab_t *msp)
|
||||
{
|
||||
return (msp->ms_allocated_space);
|
||||
}
|
||||
|
||||
/*
|
||||
* Verify that the space accounting on disk matches the in-core range_trees.
|
||||
*/
|
||||
void
|
||||
static void
|
||||
metaslab_verify_space(metaslab_t *msp, uint64_t txg)
|
||||
{
|
||||
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
|
||||
uint64_t allocated = 0;
|
||||
uint64_t allocating = 0;
|
||||
uint64_t sm_free_space, msp_free_space;
|
||||
|
||||
ASSERT(MUTEX_HELD(&msp->ms_lock));
|
||||
ASSERT(!msp->ms_condensing);
|
||||
|
||||
if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
|
||||
return;
|
||||
|
||||
/*
|
||||
* We can only verify the metaslab space when we're called
|
||||
* from syncing context with a loaded metaslab that has an allocated
|
||||
* space map. Calling this in non-syncing context does not
|
||||
* provide a consistent view of the metaslab since we're performing
|
||||
* allocations in the future.
|
||||
* from syncing context with a loaded metaslab that has an
|
||||
* allocated space map. Calling this in non-syncing context
|
||||
* does not provide a consistent view of the metaslab since
|
||||
* we're performing allocations in the future.
|
||||
*/
|
||||
if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
|
||||
!msp->ms_loaded)
|
||||
return;
|
||||
|
||||
sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
|
||||
space_map_alloc_delta(msp->ms_sm);
|
||||
/*
|
||||
* Even though the smp_alloc field can get negative (e.g.
|
||||
* see vdev_checkpoint_sm), that should never be the case
|
||||
* when it come's to a metaslab's space map.
|
||||
*/
|
||||
ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
|
||||
|
||||
sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
|
||||
|
||||
/*
|
||||
* Account for future allocations since we would have already
|
||||
* deducted that space from the ms_freetree.
|
||||
* Account for future allocations since we would have
|
||||
* already deducted that space from the ms_allocatable.
|
||||
*/
|
||||
for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
|
||||
allocated +=
|
||||
allocating +=
|
||||
range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
|
||||
}
|
||||
|
||||
msp_free_space = range_tree_space(msp->ms_allocatable) + allocated +
|
||||
ASSERT3U(msp->ms_deferspace, ==,
|
||||
range_tree_space(msp->ms_defer[0]) +
|
||||
range_tree_space(msp->ms_defer[1]));
|
||||
|
||||
msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
|
||||
msp->ms_deferspace + range_tree_space(msp->ms_freed);
|
||||
|
||||
VERIFY3U(sm_free_space, ==, msp_free_space);
|
||||
@ -929,6 +946,7 @@ metaslab_group_histogram_verify(metaslab_group_t *mg)
|
||||
|
||||
for (int m = 0; m < vd->vdev_ms_count; m++) {
|
||||
metaslab_t *msp = vd->vdev_ms[m];
|
||||
ASSERT(msp != NULL);
|
||||
|
||||
/* skip if not active or not a member */
|
||||
if (msp->ms_sm == NULL || msp->ms_group != mg)
|
||||
@ -1470,6 +1488,203 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
|
||||
* ==========================================================================
|
||||
*/
|
||||
|
||||
static void
|
||||
metaslab_aux_histograms_clear(metaslab_t *msp)
|
||||
{
|
||||
/*
|
||||
* Auxiliary histograms are only cleared when resetting them,
|
||||
* which can only happen while the metaslab is loaded.
|
||||
*/
|
||||
ASSERT(msp->ms_loaded);
|
||||
|
||||
bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
|
||||
for (int t = 0; t < TXG_DEFER_SIZE; t++)
|
||||
bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
|
||||
}
|
||||
|
||||
static void
|
||||
metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
|
||||
range_tree_t *rt)
|
||||
{
|
||||
/*
|
||||
* This is modeled after space_map_histogram_add(), so refer to that
|
||||
* function for implementation details. We want this to work like
|
||||
* the space map histogram, and not the range tree histogram, as we
|
||||
* are essentially constructing a delta that will be later subtracted
|
||||
* from the space map histogram.
|
||||
*/
|
||||
int idx = 0;
|
||||
for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
|
||||
ASSERT3U(i, >=, idx + shift);
|
||||
histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
|
||||
|
||||
if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
|
||||
ASSERT3U(idx + shift, ==, i);
|
||||
idx++;
|
||||
ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Called at every sync pass that the metaslab gets synced.
|
||||
*
|
||||
* The reason is that we want our auxiliary histograms to be updated
|
||||
* wherever the metaslab's space map histogram is updated. This way
|
||||
* we stay consistent on which parts of the metaslab space map's
|
||||
* histogram are currently not available for allocations (e.g because
|
||||
* they are in the defer, freed, and freeing trees).
|
||||
*/
|
||||
static void
|
||||
metaslab_aux_histograms_update(metaslab_t *msp)
|
||||
{
|
||||
space_map_t *sm = msp->ms_sm;
|
||||
ASSERT(sm != NULL);
|
||||
|
||||
/*
|
||||
* This is similar to the metaslab's space map histogram updates
|
||||
* that take place in metaslab_sync(). The only difference is that
|
||||
* we only care about segments that haven't made it into the
|
||||
* ms_allocatable tree yet.
|
||||
*/
|
||||
if (msp->ms_loaded) {
|
||||
metaslab_aux_histograms_clear(msp);
|
||||
|
||||
metaslab_aux_histogram_add(msp->ms_synchist,
|
||||
sm->sm_shift, msp->ms_freed);
|
||||
|
||||
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
|
||||
metaslab_aux_histogram_add(msp->ms_deferhist[t],
|
||||
sm->sm_shift, msp->ms_defer[t]);
|
||||
}
|
||||
}
|
||||
|
||||
metaslab_aux_histogram_add(msp->ms_synchist,
|
||||
sm->sm_shift, msp->ms_freeing);
|
||||
}
|
||||
|
||||
/*
|
||||
* Called every time we are done syncing (writing to) the metaslab,
|
||||
* i.e. at the end of each sync pass.
|
||||
* [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
|
||||
*/
|
||||
static void
|
||||
metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
|
||||
{
|
||||
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
|
||||
space_map_t *sm = msp->ms_sm;
|
||||
|
||||
if (sm == NULL) {
|
||||
/*
|
||||
* We came here from metaslab_init() when creating/opening a
|
||||
* pool, looking at a metaslab that hasn't had any allocations
|
||||
* yet.
|
||||
*/
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* This is similar to the actions that we take for the ms_freed
|
||||
* and ms_defer trees in metaslab_sync_done().
|
||||
*/
|
||||
uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
|
||||
if (defer_allowed) {
|
||||
bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
|
||||
sizeof (msp->ms_synchist));
|
||||
} else {
|
||||
bzero(msp->ms_deferhist[hist_index],
|
||||
sizeof (msp->ms_deferhist[hist_index]));
|
||||
}
|
||||
bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
|
||||
}
|
||||
|
||||
/*
|
||||
* Ensure that the metaslab's weight and fragmentation are consistent
|
||||
* with the contents of the histogram (either the range tree's histogram
|
||||
* or the space map's depending whether the metaslab is loaded).
|
||||
*/
|
||||
static void
|
||||
metaslab_verify_weight_and_frag(metaslab_t *msp)
|
||||
{
|
||||
ASSERT(MUTEX_HELD(&msp->ms_lock));
|
||||
|
||||
if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
|
||||
return;
|
||||
|
||||
/* see comment in metaslab_verify_unflushed_changes() */
|
||||
if (msp->ms_group == NULL)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Devices being removed always return a weight of 0 and leave
|
||||
* fragmentation and ms_max_size as is - there is nothing for
|
||||
* us to verify here.
|
||||
*/
|
||||
vdev_t *vd = msp->ms_group->mg_vd;
|
||||
if (vd->vdev_removing)
|
||||
return;
|
||||
|
||||
/*
|
||||
* If the metaslab is dirty it probably means that we've done
|
||||
* some allocations or frees that have changed our histograms
|
||||
* and thus the weight.
|
||||
*/
|
||||
for (int t = 0; t < TXG_SIZE; t++) {
|
||||
if (txg_list_member(&vd->vdev_ms_list, msp, t))
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* This verification checks that our in-memory state is consistent
|
||||
* with what's on disk. If the pool is read-only then there aren't
|
||||
* any changes and we just have the initially-loaded state.
|
||||
*/
|
||||
if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
|
||||
return;
|
||||
|
||||
/* some extra verification for in-core tree if you can */
|
||||
if (msp->ms_loaded) {
|
||||
range_tree_stat_verify(msp->ms_allocatable);
|
||||
VERIFY(space_map_histogram_verify(msp->ms_sm,
|
||||
msp->ms_allocatable));
|
||||
}
|
||||
|
||||
uint64_t weight = msp->ms_weight;
|
||||
uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
|
||||
boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
|
||||
uint64_t frag = msp->ms_fragmentation;
|
||||
uint64_t max_segsize = msp->ms_max_size;
|
||||
|
||||
msp->ms_weight = 0;
|
||||
msp->ms_fragmentation = 0;
|
||||
msp->ms_max_size = 0;
|
||||
|
||||
/*
|
||||
* This function is used for verification purposes. Regardless of
|
||||
* whether metaslab_weight() thinks this metaslab should be active or
|
||||
* not, we want to ensure that the actual weight (and therefore the
|
||||
* value of ms_weight) would be the same if it was to be recalculated
|
||||
* at this point.
|
||||
*/
|
||||
msp->ms_weight = metaslab_weight(msp) | was_active;
|
||||
|
||||
VERIFY3U(max_segsize, ==, msp->ms_max_size);
|
||||
|
||||
/*
|
||||
* If the weight type changed then there is no point in doing
|
||||
* verification. Revert fields to their original values.
|
||||
*/
|
||||
if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
|
||||
(!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
|
||||
msp->ms_fragmentation = frag;
|
||||
msp->ms_weight = weight;
|
||||
return;
|
||||
}
|
||||
|
||||
VERIFY3U(msp->ms_fragmentation, ==, frag);
|
||||
VERIFY3U(msp->ms_weight, ==, weight);
|
||||
}
|
||||
|
||||
/*
|
||||
* Wait for any in-progress metaslab loads to complete.
|
||||
*/
|
||||
@ -1491,47 +1706,94 @@ metaslab_load_impl(metaslab_t *msp)
|
||||
|
||||
ASSERT(MUTEX_HELD(&msp->ms_lock));
|
||||
ASSERT(msp->ms_loading);
|
||||
ASSERT(!msp->ms_condensing);
|
||||
|
||||
/*
|
||||
* Nobody else can manipulate a loading metaslab, so it's now safe
|
||||
* to drop the lock. This way we don't have to hold the lock while
|
||||
* reading the spacemap from disk.
|
||||
* We temporarily drop the lock to unblock other operations while we
|
||||
* are reading the space map. Therefore, metaslab_sync() and
|
||||
* metaslab_sync_done() can run at the same time as we do.
|
||||
*
|
||||
* metaslab_sync() can append to the space map while we are loading.
|
||||
* Therefore we load only entries that existed when we started the
|
||||
* load. Additionally, metaslab_sync_done() has to wait for the load
|
||||
* to complete because there are potential races like metaslab_load()
|
||||
* loading parts of the space map that are currently being appended
|
||||
* by metaslab_sync(). If we didn't, the ms_allocatable would have
|
||||
* entries that metaslab_sync_done() would try to re-add later.
|
||||
*
|
||||
* That's why before dropping the lock we remember the synced length
|
||||
* of the metaslab and read up to that point of the space map,
|
||||
* ignoring entries appended by metaslab_sync() that happen after we
|
||||
* drop the lock.
|
||||
*/
|
||||
uint64_t length = msp->ms_synced_length;
|
||||
mutex_exit(&msp->ms_lock);
|
||||
|
||||
/*
|
||||
* If the space map has not been allocated yet, then treat
|
||||
* all the space in the metaslab as free and add it to ms_allocatable.
|
||||
*/
|
||||
if (msp->ms_sm != NULL) {
|
||||
error = space_map_load(msp->ms_sm, msp->ms_allocatable,
|
||||
SM_FREE);
|
||||
error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
|
||||
SM_FREE, length);
|
||||
} else {
|
||||
/*
|
||||
* The space map has not been allocated yet, so treat
|
||||
* all the space in the metaslab as free and add it to the
|
||||
* ms_allocatable tree.
|
||||
*/
|
||||
range_tree_add(msp->ms_allocatable,
|
||||
msp->ms_start, msp->ms_size);
|
||||
}
|
||||
|
||||
/*
|
||||
* We need to grab the ms_sync_lock to prevent metaslab_sync() from
|
||||
* changing the ms_sm and the metaslab's range trees while we are
|
||||
* about to use them and populate the ms_allocatable. The ms_lock
|
||||
* is insufficient for this because metaslab_sync() doesn't hold
|
||||
* the ms_lock while writing the ms_checkpointing tree to disk.
|
||||
*/
|
||||
mutex_enter(&msp->ms_sync_lock);
|
||||
mutex_enter(&msp->ms_lock);
|
||||
ASSERT(!msp->ms_condensing);
|
||||
|
||||
if (error != 0)
|
||||
if (error != 0) {
|
||||
mutex_exit(&msp->ms_sync_lock);
|
||||
return (error);
|
||||
}
|
||||
|
||||
ASSERT3P(msp->ms_group, !=, NULL);
|
||||
msp->ms_loaded = B_TRUE;
|
||||
|
||||
/*
|
||||
* If the metaslab already has a spacemap, then we need to
|
||||
* remove all segments from the defer tree; otherwise, the
|
||||
* metaslab is completely empty and we can skip this.
|
||||
* The ms_allocatable contains the segments that exist in the
|
||||
* ms_defer trees [see ms_synced_length]. Thus we need to remove
|
||||
* them from ms_allocatable as they will be added again in
|
||||
* metaslab_sync_done().
|
||||
*/
|
||||
if (msp->ms_sm != NULL) {
|
||||
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
|
||||
range_tree_walk(msp->ms_defer[t],
|
||||
range_tree_remove, msp->ms_allocatable);
|
||||
}
|
||||
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
|
||||
range_tree_walk(msp->ms_defer[t],
|
||||
range_tree_remove, msp->ms_allocatable);
|
||||
}
|
||||
|
||||
/*
|
||||
* Call metaslab_recalculate_weight_and_sort() now that the
|
||||
* metaslab is loaded so we get the metaslab's real weight.
|
||||
*
|
||||
* Unless this metaslab was created with older software and
|
||||
* has not yet been converted to use segment-based weight, we
|
||||
* expect the new weight to be better or equal to the weight
|
||||
* that the metaslab had while it was not loaded. This is
|
||||
* because the old weight does not take into account the
|
||||
* consolidation of adjacent segments between TXGs. [see
|
||||
* comment for ms_synchist and ms_deferhist[] for more info]
|
||||
*/
|
||||
uint64_t weight = msp->ms_weight;
|
||||
metaslab_recalculate_weight_and_sort(msp);
|
||||
if (!WEIGHT_IS_SPACEBASED(weight))
|
||||
ASSERT3U(weight, <=, msp->ms_weight);
|
||||
msp->ms_max_size = metaslab_block_maxsize(msp);
|
||||
|
||||
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
|
||||
metaslab_verify_space(msp, spa_syncing_txg(spa));
|
||||
mutex_exit(&msp->ms_sync_lock);
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
@ -1548,6 +1810,7 @@ metaslab_load(metaslab_t *msp)
|
||||
if (msp->ms_loaded)
|
||||
return (0);
|
||||
VERIFY(!msp->ms_loading);
|
||||
ASSERT(!msp->ms_condensing);
|
||||
|
||||
msp->ms_loading = B_TRUE;
|
||||
int error = metaslab_load_impl(msp);
|
||||
@ -1561,10 +1824,29 @@ void
|
||||
metaslab_unload(metaslab_t *msp)
|
||||
{
|
||||
ASSERT(MUTEX_HELD(&msp->ms_lock));
|
||||
|
||||
metaslab_verify_weight_and_frag(msp);
|
||||
|
||||
range_tree_vacate(msp->ms_allocatable, NULL, NULL);
|
||||
msp->ms_loaded = B_FALSE;
|
||||
|
||||
msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
|
||||
msp->ms_max_size = 0;
|
||||
|
||||
/*
|
||||
* We explicitly recalculate the metaslab's weight based on its space
|
||||
* map (as it is now not loaded). We want unload metaslabs to always
|
||||
* have their weights calculated from the space map histograms, while
|
||||
* loaded ones have it calculated from their in-core range tree
|
||||
* [see metaslab_load()]. This way, the weight reflects the information
|
||||
* available in-core, whether it is loaded or not
|
||||
*
|
||||
* If ms_group == NULL means that we came here from metaslab_fini(),
|
||||
* at which point it doesn't make sense for us to do the recalculation
|
||||
* and the sorting.
|
||||
*/
|
||||
if (msp->ms_group != NULL)
|
||||
metaslab_recalculate_weight_and_sort(msp);
|
||||
}
|
||||
|
||||
static void
|
||||
@ -1604,6 +1886,13 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
|
||||
/*
|
||||
* We only open space map objects that already exist. All others
|
||||
* will be opened when we finally allocate an object for it.
|
||||
*
|
||||
* Note:
|
||||
* When called from vdev_expand(), we can't call into the DMU as
|
||||
* we are holding the spa_config_lock as a writer and we would
|
||||
* deadlock [see relevant comment in vdev_metaslab_init()]. in
|
||||
* that case, the object parameter is zero though, so we won't
|
||||
* call into the DMU.
|
||||
*/
|
||||
if (object != 0) {
|
||||
error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
|
||||
@ -1615,14 +1904,17 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
|
||||
}
|
||||
|
||||
ASSERT(ms->ms_sm != NULL);
|
||||
ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0);
|
||||
ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
|
||||
}
|
||||
|
||||
/*
|
||||
* We create the main range tree here, but we don't create the
|
||||
* We create the ms_allocatable here, but we don't create the
|
||||
* other range trees until metaslab_sync_done(). This serves
|
||||
* two purposes: it allows metaslab_sync_done() to detect the
|
||||
* addition of new space; and for debugging, it ensures that we'd
|
||||
* data fault on any attempt to use this metaslab before it's ready.
|
||||
* addition of new space; and for debugging, it ensures that
|
||||
* we'd data fault on any attempt to use this metaslab before
|
||||
* it's ready.
|
||||
*/
|
||||
ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, &ms->ms_allocatable_by_size,
|
||||
metaslab_rangesize_compare, 0);
|
||||
@ -1639,8 +1931,11 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
|
||||
* out this txg. This ensures that we don't attempt to allocate
|
||||
* from it before we have initialized it completely.
|
||||
*/
|
||||
if (txg <= TXG_INITIAL)
|
||||
if (txg <= TXG_INITIAL) {
|
||||
metaslab_sync_done(ms, 0);
|
||||
metaslab_space_update(vd, mg->mg_class,
|
||||
metaslab_allocated_space(ms), 0, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* If metaslab_debug_load is set and we're initializing a metaslab
|
||||
@ -1674,7 +1969,7 @@ metaslab_fini(metaslab_t *msp)
|
||||
mutex_enter(&msp->ms_lock);
|
||||
VERIFY(msp->ms_group == NULL);
|
||||
metaslab_space_update(vd, mg->mg_class,
|
||||
-space_map_allocated(msp->ms_sm), 0, -msp->ms_size);
|
||||
-metaslab_allocated_space(msp), 0, -msp->ms_size);
|
||||
|
||||
space_map_close(msp->ms_sm);
|
||||
|
||||
@ -1695,6 +1990,9 @@ metaslab_fini(metaslab_t *msp)
|
||||
|
||||
range_tree_destroy(msp->ms_checkpointing);
|
||||
|
||||
for (int t = 0; t < TXG_SIZE; t++)
|
||||
ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
|
||||
|
||||
mutex_exit(&msp->ms_lock);
|
||||
cv_destroy(&msp->ms_load_cv);
|
||||
mutex_destroy(&msp->ms_lock);
|
||||
@ -1710,7 +2008,7 @@ metaslab_fini(metaslab_t *msp)
|
||||
* This table defines a segment size based fragmentation metric that will
|
||||
* allow each metaslab to derive its own fragmentation value. This is done
|
||||
* by calculating the space in each bucket of the spacemap histogram and
|
||||
* multiplying that by the fragmetation metric in this table. Doing
|
||||
* multiplying that by the fragmentation metric in this table. Doing
|
||||
* this for all buckets and dividing it by the total amount of free
|
||||
* space in this metaslab (i.e. the total free space in all buckets) gives
|
||||
* us the fragmentation metric. This means that a high fragmentation metric
|
||||
@ -1745,10 +2043,10 @@ int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
|
||||
};
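Read as code, the comment above describes a weighted average over the space map histogram. In this sketch, bucket_space[] is a stand-in for the free space attributed to each histogram bucket (the real code derives it from the bucket count and segment size); it is illustrative only:

    uint64_t total = 0, weighted = 0;
    for (int b = 0; b < FRAGMENTATION_TABLE_SIZE; b++) {
        total += bucket_space[b];                      /* free space in bucket b */
        weighted += bucket_space[b] * zfs_frag_table[b];
    }
    uint64_t fragmentation = (total == 0) ? 0 : weighted / total;  /* 0..100 */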
|
||||
|
||||
/*
|
||||
* Calclate the metaslab's fragmentation metric. A return value
|
||||
* of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
|
||||
* not support this metric. Otherwise, the return value should be in the
|
||||
* range [0, 100].
|
||||
* Calculate the metaslab's fragmentation metric and set ms_fragmentation.
|
||||
* Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
|
||||
* been upgraded and does not support this metric. Otherwise, the return
|
||||
* value should be in the range [0, 100].
|
||||
*/
|
||||
static void
|
||||
metaslab_set_fragmentation(metaslab_t *msp)
|
||||
@ -1841,7 +2139,7 @@ metaslab_space_weight(metaslab_t *msp)
|
||||
/*
|
||||
* The baseline weight is the metaslab's free space.
|
||||
*/
|
||||
space = msp->ms_size - space_map_allocated(msp->ms_sm);
|
||||
space = msp->ms_size - metaslab_allocated_space(msp);
|
||||
|
||||
if (metaslab_fragmentation_factor_enabled &&
|
||||
msp->ms_fragmentation != ZFS_FRAG_INVALID) {
|
||||
@ -1945,14 +2243,38 @@ metaslab_weight_from_range_tree(metaslab_t *msp)
|
||||
static uint64_t
|
||||
metaslab_weight_from_spacemap(metaslab_t *msp)
|
||||
{
|
||||
uint64_t weight = 0;
|
||||
space_map_t *sm = msp->ms_sm;
|
||||
ASSERT(!msp->ms_loaded);
|
||||
ASSERT(sm != NULL);
|
||||
ASSERT3U(space_map_object(sm), !=, 0);
|
||||
ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
|
||||
|
||||
/*
|
||||
* Create a joint histogram from all the segments that have made
|
||||
* it to the metaslab's space map histogram, that are not yet
|
||||
* available for allocation because they are still in the freeing
|
||||
* pipeline (e.g. freeing, freed, and defer trees). Then subtract
|
||||
* these segments from the space map's histogram to get a more
|
||||
* accurate weight.
|
||||
*/
|
||||
uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
|
||||
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
|
||||
deferspace_histogram[i] += msp->ms_synchist[i];
|
||||
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
|
||||
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
|
||||
deferspace_histogram[i] += msp->ms_deferhist[t][i];
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t weight = 0;
|
||||
for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
|
||||
if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
|
||||
WEIGHT_SET_COUNT(weight,
|
||||
msp->ms_sm->sm_phys->smp_histogram[i]);
|
||||
WEIGHT_SET_INDEX(weight, i +
|
||||
msp->ms_sm->sm_shift);
|
||||
ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
|
||||
deferspace_histogram[i]);
|
||||
uint64_t count =
|
||||
sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
|
||||
if (count != 0) {
|
||||
WEIGHT_SET_COUNT(weight, count);
|
||||
WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
|
||||
WEIGHT_SET_ACTIVE(weight, 0);
|
||||
break;
|
||||
}
|
||||
@ -1977,7 +2299,7 @@ metaslab_segment_weight(metaslab_t *msp)
|
||||
/*
|
||||
* The metaslab is completely free.
|
||||
*/
|
||||
if (space_map_allocated(msp->ms_sm) == 0) {
|
||||
if (metaslab_allocated_space(msp) == 0) {
|
||||
int idx = highbit64(msp->ms_size) - 1;
|
||||
int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
|
||||
|
||||
@ -1999,7 +2321,7 @@ metaslab_segment_weight(metaslab_t *msp)
|
||||
/*
|
||||
* If the metaslab is fully allocated then just make the weight 0.
|
||||
*/
|
||||
if (space_map_allocated(msp->ms_sm) == msp->ms_size)
|
||||
if (metaslab_allocated_space(msp) == msp->ms_size)
|
||||
return (0);
|
||||
/*
|
||||
* If the metaslab is already loaded, then use the range tree to
|
||||
@ -2080,6 +2402,8 @@ metaslab_weight(metaslab_t *msp)
|
||||
*/
|
||||
if (msp->ms_loaded)
|
||||
msp->ms_max_size = metaslab_block_maxsize(msp);
|
||||
else
|
||||
ASSERT0(msp->ms_max_size);
|
||||
|
||||
/*
|
||||
* Segment-based weighting requires space map histogram support.
|
||||
@ -2095,6 +2419,15 @@ metaslab_weight(metaslab_t *msp)
|
||||
return (weight);
|
||||
}
|
||||
|
||||
void
|
||||
metaslab_recalculate_weight_and_sort(metaslab_t *msp)
|
||||
{
|
||||
/* note: we preserve the mask (e.g. indication of primary, etc..) */
|
||||
uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
|
||||
metaslab_group_sort(msp->ms_group, msp,
|
||||
metaslab_weight(msp) | was_active);
|
||||
}
|
||||
|
||||
static int
|
||||
metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
|
||||
int allocator, uint64_t activation_weight)
|
||||
@ -2479,17 +2812,17 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
|
||||
VERIFY(txg <= spa_final_dirty_txg(spa));
|
||||
|
||||
/*
|
||||
* The only state that can actually be changing concurrently with
|
||||
* metaslab_sync() is the metaslab's ms_allocatable. No other
|
||||
* thread can be modifying this txg's alloc, freeing,
|
||||
* The only state that can actually be changing concurrently
|
||||
* with metaslab_sync() is the metaslab's ms_allocatable. No
|
||||
* other thread can be modifying this txg's alloc, freeing,
|
||||
* freed, or space_map_phys_t. We drop ms_lock whenever we
|
||||
* could call into the DMU, because the DMU can call down to us
|
||||
* (e.g. via zio_free()) at any time.
|
||||
* could call into the DMU, because the DMU can call down to
|
||||
* us (e.g. via zio_free()) at any time.
|
||||
*
|
||||
* The spa_vdev_remove_thread() can be reading metaslab state
|
||||
* concurrently, and it is locked out by the ms_sync_lock. Note
|
||||
* that the ms_lock is insufficient for this, because it is dropped
|
||||
* by space_map_write().
|
||||
* concurrently, and it is locked out by the ms_sync_lock.
|
||||
* Note that the ms_lock is insufficient for this, because it
|
||||
* is dropped by space_map_write().
|
||||
*/
|
||||
tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
|
||||
|
||||
@ -2501,7 +2834,9 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
|
||||
|
||||
VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
|
||||
msp->ms_start, msp->ms_size, vd->vdev_ashift));
|
||||
|
||||
ASSERT(msp->ms_sm != NULL);
|
||||
ASSERT0(metaslab_allocated_space(msp));
|
||||
}
|
||||
|
||||
if (!range_tree_is_empty(msp->ms_checkpointing) &&
|
||||
@ -2549,6 +2884,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
|
||||
mutex_enter(&msp->ms_lock);
|
||||
}
|
||||
|
||||
msp->ms_allocated_space += range_tree_space(alloctree);
|
||||
ASSERT3U(msp->ms_allocated_space, >=,
|
||||
range_tree_space(msp->ms_freeing));
|
||||
msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
|
||||
|
||||
if (!range_tree_is_empty(msp->ms_checkpointing)) {
|
||||
ASSERT(spa_has_checkpoint(spa));
|
||||
ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
|
||||
@ -2562,14 +2902,13 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
|
||||
space_map_write(vd->vdev_checkpoint_sm,
|
||||
msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
|
||||
mutex_enter(&msp->ms_lock);
|
||||
space_map_update(vd->vdev_checkpoint_sm);
|
||||
|
||||
spa->spa_checkpoint_info.sci_dspace +=
|
||||
range_tree_space(msp->ms_checkpointing);
|
||||
vd->vdev_stat.vs_checkpoint_space +=
|
||||
range_tree_space(msp->ms_checkpointing);
|
||||
ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
|
||||
-vd->vdev_checkpoint_sm->sm_alloc);
|
||||
-space_map_allocated(vd->vdev_checkpoint_sm));
|
||||
|
||||
range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
|
||||
}
|
||||
@ -2614,6 +2953,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
|
||||
* time we load the space map.
|
||||
*/
|
||||
space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
|
||||
metaslab_aux_histograms_update(msp);
|
||||
|
||||
metaslab_group_histogram_add(mg, msp);
|
||||
metaslab_group_histogram_verify(mg);
|
||||
@ -2621,16 +2961,18 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
|
||||
|
||||
/*
|
||||
* For sync pass 1, we avoid traversing this txg's free range tree
|
||||
* and instead will just swap the pointers for freeing and
|
||||
* freed. We can safely do this since the freed_tree is
|
||||
* guaranteed to be empty on the initial pass.
|
||||
* and instead will just swap the pointers for freeing and freed.
|
||||
* We can safely do this since the freed_tree is guaranteed to be
|
||||
* empty on the initial pass.
|
||||
*/
|
||||
if (spa_sync_pass(spa) == 1) {
|
||||
range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
|
||||
ASSERT0(msp->ms_allocated_this_txg);
|
||||
} else {
|
||||
range_tree_vacate(msp->ms_freeing,
|
||||
range_tree_add, msp->ms_freed);
|
||||
}
|
||||
msp->ms_allocated_this_txg += range_tree_space(alloctree);
|
||||
range_tree_vacate(alloctree, NULL, NULL);
|
||||
|
||||
ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
|
||||
@ -2708,7 +3050,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
|
||||
}
|
||||
|
||||
defer_delta = 0;
|
||||
alloc_delta = space_map_alloc_delta(msp->ms_sm);
|
||||
alloc_delta = msp->ms_allocated_this_txg -
|
||||
range_tree_space(msp->ms_freed);
|
||||
if (defer_allowed) {
|
||||
defer_delta = range_tree_space(msp->ms_freed) -
|
||||
range_tree_space(*defer_tree);
|
||||
@ -2740,7 +3083,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
|
||||
msp->ms_loaded ? range_tree_add : NULL,
|
||||
msp->ms_allocatable);
|
||||
}
|
||||
space_map_update(msp->ms_sm);
|
||||
|
||||
msp->ms_synced_length = space_map_length(msp->ms_sm);
|
||||
|
||||
msp->ms_deferspace += defer_delta;
|
||||
ASSERT3S(msp->ms_deferspace, >=, 0);
|
||||
@ -2752,6 +3096,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
|
||||
*/
|
||||
vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
|
||||
}
|
||||
metaslab_aux_histograms_update_done(msp, defer_allowed);
|
||||
|
||||
if (msp->ms_new) {
|
||||
msp->ms_new = B_FALSE;
|
||||
@ -2759,12 +3104,12 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
|
||||
mg->mg_ms_ready++;
|
||||
mutex_exit(&mg->mg_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Calculate the new weights before unloading any metaslabs.
|
||||
* This will give us the most accurate weighting.
|
||||
* Re-sort metaslab within its group now that we've adjusted
|
||||
* its allocatable space.
|
||||
*/
|
||||
metaslab_group_sort(mg, msp, metaslab_weight(msp) |
|
||||
(msp->ms_weight & METASLAB_ACTIVE_MASK));
|
||||
metaslab_recalculate_weight_and_sort(msp);
|
||||
|
||||
/*
|
||||
* If the metaslab is loaded and we've not tried to load or allocate
|
||||
@ -2791,6 +3136,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
|
||||
ASSERT0(range_tree_space(msp->ms_freed));
|
||||
ASSERT0(range_tree_space(msp->ms_checkpointing));
|
||||
|
||||
msp->ms_allocated_this_txg = 0;
|
||||
mutex_exit(&msp->ms_lock);
|
||||
}
|
||||
|
||||
@ -4073,7 +4419,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
|
||||
zio_alloc_list_t *zal, zio_t *zio, int allocator)
|
||||
{
|
||||
dva_t *dva = bp->blk_dva;
|
||||
dva_t *hintdva = hintbp->blk_dva;
|
||||
dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
|
||||
int error = 0;
|
||||
|
||||
ASSERT(bp->blk_birth == 0);
|
||||
@ -4240,14 +4586,16 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
|
||||
msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
|
||||
|
||||
mutex_enter(&msp->ms_lock);
|
||||
if (msp->ms_loaded)
|
||||
range_tree_verify(msp->ms_allocatable, offset, size);
|
||||
if (msp->ms_loaded) {
|
||||
range_tree_verify_not_present(msp->ms_allocatable,
|
||||
offset, size);
|
||||
}
|
||||
|
||||
range_tree_verify(msp->ms_freeing, offset, size);
|
||||
range_tree_verify(msp->ms_checkpointing, offset, size);
|
||||
range_tree_verify(msp->ms_freed, offset, size);
|
||||
range_tree_verify_not_present(msp->ms_freeing, offset, size);
|
||||
range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
|
||||
range_tree_verify_not_present(msp->ms_freed, offset, size);
|
||||
for (int j = 0; j < TXG_DEFER_SIZE; j++)
|
||||
range_tree_verify(msp->ms_defer[j], offset, size);
|
||||
range_tree_verify_not_present(msp->ms_defer[j], offset, size);
|
||||
mutex_exit(&msp->ms_lock);
|
||||
}
|
||||
|
||||
|
@@ -511,13 +511,11 @@ range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
 }
 
 void
-range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
+range_tree_verify_not_present(range_tree_t *rt, uint64_t off, uint64_t size)
 {
-    range_seg_t *rs;
-
-    rs = range_tree_find(rt, off, size);
+    range_seg_t *rs = range_tree_find(rt, off, size);
     if (rs != NULL)
-        panic("freeing free block; rs=%p", (void *)rs);
+        panic("segment already in tree; rs=%p", (void *)rs);
 }
 
 boolean_t
@@ -129,7 +129,7 @@
  * uberblock would reference data in the removed device. For this reason
  * and others of similar nature, we disallow the following operations that
  * can change the config:
- *    vdev removal and attach/detach, mirror splitting, and pool reguid.
+ *    vdev removal and attach/detach, mirror splitting, and pool reguid.
  *
  * - As most of the checkpoint logic is implemented in the SPA and doesn't
  *   distinguish datasets when it comes to space accounting, having a
@@ -262,7 +262,7 @@ spa_checkpoint_accounting_verify(spa_t *spa)
 
         if (vd->vdev_checkpoint_sm != NULL) {
             ckpoint_sm_space_sum +=
-                -vd->vdev_checkpoint_sm->sm_alloc;
+                -space_map_allocated(vd->vdev_checkpoint_sm);
             vs_ckpoint_space_sum +=
                 vd->vdev_stat.vs_checkpoint_space;
             ASSERT3U(ckpoint_sm_space_sum, ==,
@@ -347,7 +347,7 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
             error, vd->vdev_id);
     }
     ASSERT0(words_after);
-    ASSERT0(vd->vdev_checkpoint_sm->sm_alloc);
+    ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm));
     ASSERT0(space_map_length(vd->vdev_checkpoint_sm));
 
     space_map_free(vd->vdev_checkpoint_sm, tx);
@@ -23,7 +23,7 @@
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -86,20 +86,22 @@ sm_entry_is_double_word(uint64_t e)
 
 /*
  * Iterate through the space map, invoking the callback on each (non-debug)
- * space map entry.
+ * space map entry. Stop after reading 'end' bytes of the space map.
  */
 int
-space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg)
+space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
 {
-    uint64_t sm_len = space_map_length(sm);
-    ASSERT3U(sm->sm_blksz, !=, 0);
+    uint64_t blksz = sm->sm_blksz;
 
-    dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, sm_len,
+    ASSERT3U(blksz, !=, 0);
+    ASSERT3U(end, <=, space_map_length(sm));
+    ASSERT0(P2PHASE(end, sizeof (uint64_t)));
+
+    dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end,
         ZIO_PRIORITY_SYNC_READ);
 
-    uint64_t blksz = sm->sm_blksz;
     int error = 0;
-    for (uint64_t block_base = 0; block_base < sm_len && error == 0;
+    for (uint64_t block_base = 0; block_base < end && error == 0;
         block_base += blksz) {
         dmu_buf_t *db;
         error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
@@ -108,7 +110,7 @@ space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
             return (error);
 
         uint64_t *block_start = db->db_data;
-        uint64_t block_length = MIN(sm_len - block_base, blksz);
+        uint64_t block_length = MIN(end - block_base, blksz);
         uint64_t *block_end = block_start +
             (block_length / sizeof (uint64_t));
 
@@ -191,7 +193,7 @@ space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
      * dmu_buf_hold().
      */
     uint64_t last_word_offset =
-        sm->sm_phys->smp_objsize - sizeof (uint64_t);
+        sm->sm_phys->smp_length - sizeof (uint64_t);
     error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
         FTAG, &db, DMU_READ_NO_PREFETCH);
     if (error != 0)
@@ -204,7 +206,7 @@ space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
 
     uint64_t *words = db->db_data;
     *nwords =
-        (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
+        (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
 
     ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));
 
@@ -303,8 +305,7 @@ space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
         uint64_t e = buf[i];
 
         if (sm_entry_is_debug(e)) {
-            sm->sm_phys->smp_objsize -= sizeof (uint64_t);
-            space_map_update(sm);
+            sm->sm_phys->smp_length -= sizeof (uint64_t);
             continue;
         }
 
@@ -359,15 +360,13 @@ space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
             sm->sm_phys->smp_alloc -= entry_run;
         else
             sm->sm_phys->smp_alloc += entry_run;
-        sm->sm_phys->smp_objsize -= words * sizeof (uint64_t);
-        space_map_update(sm);
+        sm->sm_phys->smp_length -= words * sizeof (uint64_t);
     }
 }
 
 if (space_map_length(sm) == 0) {
     ASSERT0(error);
-    ASSERT0(sm->sm_phys->smp_objsize);
-    ASSERT0(sm->sm_alloc);
+    ASSERT0(space_map_allocated(sm));
 }
 
 zio_buf_free(buf, bufsz);
@ -395,6 +394,33 @@ space_map_load_callback(space_map_entry_t *sme, void *arg)
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Load the spacemap into the rangetree, like space_map_load. But only
|
||||
* read the first 'length' bytes of the spacemap.
|
||||
*/
|
||||
int
|
||||
space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
|
||||
uint64_t length)
|
||||
{
|
||||
space_map_load_arg_t smla;
|
||||
|
||||
VERIFY0(range_tree_space(rt));
|
||||
|
||||
if (maptype == SM_FREE)
|
||||
range_tree_add(rt, sm->sm_start, sm->sm_size);
|
||||
|
||||
smla.smla_rt = rt;
|
||||
smla.smla_sm = sm;
|
||||
smla.smla_type = maptype;
|
||||
int err = space_map_iterate(sm, length,
|
||||
space_map_load_callback, &smla);
|
||||
|
||||
if (err != 0)
|
||||
range_tree_vacate(rt, NULL, NULL);
|
||||
|
||||
return (err);
|
||||
}
|
||||
|
||||
/*
|
||||
* Load the space map disk into the specified range tree. Segments of maptype
|
||||
* are added to the range tree, other segment types are removed.
|
||||
@ -402,30 +428,7 @@ space_map_load_callback(space_map_entry_t *sme, void *arg)
|
||||
int
|
||||
space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
|
||||
{
|
||||
uint64_t space;
|
||||
int err;
|
||||
space_map_load_arg_t smla;
|
||||
|
||||
VERIFY0(range_tree_space(rt));
|
||||
space = space_map_allocated(sm);
|
||||
|
||||
if (maptype == SM_FREE) {
|
||||
range_tree_add(rt, sm->sm_start, sm->sm_size);
|
||||
space = sm->sm_size - space;
|
||||
}
|
||||
|
||||
smla.smla_rt = rt;
|
||||
smla.smla_sm = sm;
|
||||
smla.smla_type = maptype;
|
||||
err = space_map_iterate(sm, space_map_load_callback, &smla);
|
||||
|
||||
if (err == 0) {
|
||||
VERIFY3U(range_tree_space(rt), ==, space);
|
||||
} else {
|
||||
range_tree_vacate(rt, NULL, NULL);
|
||||
}
|
||||
|
||||
return (err);
|
||||
return (space_map_load_length(sm, rt, maptype, space_map_length(sm)));
|
||||
}
|
||||
|
||||
void
|
||||
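With the bounded iterator in place, space_map_load() above becomes a thin wrapper over the new space_map_load_length(). The point of the separate entry point is that a reader can bound the walk by a length it captured earlier, even if the object keeps growing. The fragment below is a hypothetical illustration only (the helper name and the exact locking are assumptions, not code from this commit); it uses the ms_synced_length and ms_sync_lock fields added to struct metaslab later in this diff.

/* Hypothetical illustration; not part of this commit. */
static int
load_synced_free_segments(metaslab_t *msp, range_tree_t *rt)
{
	uint64_t length;

	/*
	 * Capture how many bytes of the space map have been synced so
	 * far; a concurrent metaslab_sync() may keep appending, but the
	 * bounded load below will not read past this point.
	 */
	mutex_enter(&msp->ms_sync_lock);
	length = msp->ms_synced_length;
	mutex_exit(&msp->ms_sync_lock);

	return (space_map_load_length(msp->ms_sm, rt, SM_FREE, length));
}
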
@ -511,10 +514,10 @@ space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx)
SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) |
SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));

dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_objsize,
dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length,
sizeof (dentry), &dentry, tx);

sm->sm_phys->smp_objsize += sizeof (dentry);
sm->sm_phys->smp_length += sizeof (dentry);
}

/*

@ -546,7 +549,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
uint64_t *block_base = db->db_data;
uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
uint64_t *block_cursor = block_base +
(sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
(sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);

ASSERT3P(block_cursor, <=, block_end);

@ -569,7 +572,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
if (block_cursor == block_end) {
dmu_buf_rele(db, tag);

uint64_t next_word_offset = sm->sm_phys->smp_objsize;
uint64_t next_word_offset = sm->sm_phys->smp_length;
VERIFY0(dmu_buf_hold(sm->sm_os,
space_map_object(sm), next_word_offset,
tag, &db, DMU_READ_PREFETCH));

@ -599,7 +602,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
SM_DEBUG_SYNCPASS_ENCODE(0) |
SM_DEBUG_TXG_ENCODE(0);
block_cursor++;
sm->sm_phys->smp_objsize += sizeof (uint64_t);
sm->sm_phys->smp_length += sizeof (uint64_t);
ASSERT3P(block_cursor, ==, block_end);
continue;
}

@ -630,7 +633,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
words);
break;
}
sm->sm_phys->smp_objsize += words * sizeof (uint64_t);
sm->sm_phys->smp_length += words * sizeof (uint64_t);

start += run_len;
size -= run_len;

@ -657,7 +660,7 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
* We do this right after we write the intro debug entry
* because the estimate does not take it into account.
*/
uint64_t initial_objsize = sm->sm_phys->smp_objsize;
uint64_t initial_objsize = sm->sm_phys->smp_length;
uint64_t estimated_growth =
space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
uint64_t estimated_final_objsize = initial_objsize + estimated_growth;

@ -668,7 +671,7 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
* and use that to get a hold of the last block, so we can
* start appending to it.
*/
uint64_t next_word_offset = sm->sm_phys->smp_objsize;
uint64_t next_word_offset = sm->sm_phys->smp_length;
VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
ASSERT3U(db->db_size, ==, sm->sm_blksz);

@ -716,7 +719,7 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
* Therefore we expect the actual objsize to be equal or less
* than whatever we estimated it to be.
*/
ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_objsize);
ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length);
#endif
}

@ -872,23 +875,10 @@ space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx)
}

dmu_buf_will_dirty(sm->sm_dbuf, tx);
sm->sm_phys->smp_objsize = 0;
sm->sm_phys->smp_length = 0;
sm->sm_phys->smp_alloc = 0;
}

/*
* Update the in-core space_map allocation and length values.
*/
void
space_map_update(space_map_t *sm)
{
if (sm == NULL)
return;

sm->sm_alloc = sm->sm_phys->smp_alloc;
sm->sm_length = sm->sm_phys->smp_objsize;
}

uint64_t
space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{

@ -1070,32 +1060,14 @@ space_map_object(space_map_t *sm)
return (sm != NULL ? sm->sm_object : 0);
}

/*
* Returns the already synced, on-disk allocated space.
*/
uint64_t
int64_t
space_map_allocated(space_map_t *sm)
{
return (sm != NULL ? sm->sm_alloc : 0);
return (sm != NULL ? sm->sm_phys->smp_alloc : 0);
}

/*
* Returns the already synced, on-disk length;
*/
uint64_t
space_map_length(space_map_t *sm)
{
return (sm != NULL ? sm->sm_length : 0);
}

/*
* Returns the allocated space that is currently syncing.
*/
int64_t
space_map_alloc_delta(space_map_t *sm)
{
if (sm == NULL)
return (0);
ASSERT(sm->sm_dbuf != NULL);
return (sm->sm_phys->smp_alloc - space_map_allocated(sm));
return (sm != NULL ? sm->sm_phys->smp_length : 0);
}

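With space_map_update() and the in-core sm_alloc/sm_length copies gone, the accessors above read straight out of the dbuf-backed space_map_phys_t, and space_map_allocated() is now signed. A minimal sketch of the resulting semantics (illustrative only; print_space_map_state() is not a function in this commit):

/* Illustrative helper, assuming the accessors shown in the hunk above. */
static void
print_space_map_state(space_map_t *sm)
{
	/*
	 * space_map_allocated() returns sm->sm_phys->smp_alloc, which can
	 * be negative for free-only maps; space_map_length() returns
	 * sm->sm_phys->smp_length, the number of bytes written so far.
	 */
	zfs_dbgmsg("space map %llu: alloc=%lld length=%llu",
	    (u_longlong_t)space_map_object(sm),
	    (longlong_t)space_map_allocated(sm),
	    (u_longlong_t)space_map_length(sm));
}
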
@ -52,6 +52,8 @@ void metaslab_fini(metaslab_t *);
int metaslab_load(metaslab_t *);
void metaslab_unload(metaslab_t *);

uint64_t metaslab_allocated_space(metaslab_t *);

void metaslab_sync(metaslab_t *, uint64_t);
void metaslab_sync_done(metaslab_t *, uint64_t);
void metaslab_sync_reassess(metaslab_group_t *);
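The new metaslab_allocated_space() accessor is what the vdev_initialize and vdev_removal hunks further down switch to, instead of reading space_map_allocated(msp->ms_sm) directly. A hypothetical caller-side sketch (the helper name is made up for illustration):

/* Hypothetical helper; mirrors the caller-side change later in this diff. */
static uint64_t
metaslab_free_space(metaslab_t *msp)
{
	/*
	 * ms_size is the metaslab's total size; what is allocated is now
	 * tracked at the metaslab level rather than read from the raw
	 * space map.
	 */
	return (msp->ms_size - metaslab_allocated_space(msp));
}
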
@ -116,6 +118,7 @@ void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
boolean_t);
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
void metaslab_recalculate_weight_and_sort(metaslab_t *);

#ifdef __cplusplus
}

@ -341,8 +341,34 @@ struct metaslab_group {
* being written.
*/
struct metaslab {
/*
* This is the main lock of the metaslab and its purpose is to
* coordinate our allocations and frees [e.g metaslab_block_alloc(),
* metaslab_free_concrete(), ..etc] with our various syncing
* procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc].
*
* The lock is also used during some miscellaneous operations like
* using the metaslab's histogram for the metaslab group's histogram
* aggregation, or marking the metaslab for initialization.
*/
kmutex_t ms_lock;

/*
* Acquired together with the ms_lock whenever we expect to
* write to metaslab data on-disk (i.e flushing entries to
* the metaslab's space map). It helps coordinate readers of
* the metaslab's space map [see spa_vdev_remove_thread()]
* with writers [see metaslab_sync()].
*
* Note that metaslab_load(), even though a reader, uses
* a completely different mechanism to deal with the reading
* of the metaslab's space map based on ms_synced_length. That
* said, the function still uses the ms_sync_lock after it
* has read the ms_sm [see relevant comment in metaslab_load()
* as to why].
*/
kmutex_t ms_sync_lock;

kcondvar_t ms_load_cv;
space_map_t *ms_sm;
uint64_t ms_id;
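The block comment above documents the locking discipline for the new ms_sync_lock. A hypothetical writer-side sketch of that discipline (illustrative only; the real flushing happens in metaslab_sync(), and the specific arguments here are assumptions):

/* Illustrative writer path; not code from this commit. */
static void
metaslab_flush_tree(metaslab_t *msp, range_tree_t *rt, dmu_tx_t *tx)
{
	/*
	 * Writers that append entries to the metaslab's space map take
	 * ms_sync_lock and then ms_lock; readers of the on-disk space
	 * map (e.g. spa_vdev_remove_thread()) take ms_sync_lock.
	 */
	mutex_enter(&msp->ms_sync_lock);
	mutex_enter(&msp->ms_lock);

	space_map_write(msp->ms_sm, rt, SM_ALLOC, SM_NO_VDEVID, tx);

	mutex_exit(&msp->ms_lock);
	mutex_exit(&msp->ms_sync_lock);
}
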
@ -352,6 +378,7 @@ struct metaslab {

range_tree_t *ms_allocating[TXG_SIZE];
range_tree_t *ms_allocatable;
uint64_t ms_allocated_this_txg;

/*
* The following range trees are accessed only from syncing context.

@ -376,6 +403,55 @@ struct metaslab {
boolean_t ms_loaded;
boolean_t ms_loading;

/*
* The following histograms count entries that are in the
* metaslab's space map (and its histogram) but are not in
* ms_allocatable yet, because they are in ms_freed, ms_freeing,
* or ms_defer[].
*
* When the metaslab is not loaded, its ms_weight needs to
* reflect what is allocatable (i.e. what will be part of
* ms_allocatable if it is loaded). The weight is computed from
* the spacemap histogram, but that includes ranges that are
* not yet allocatable (because they are in ms_freed,
* ms_freeing, or ms_defer[]). Therefore, when calculating the
* weight, we need to remove those ranges.
*
* The ranges in the ms_freed and ms_defer[] range trees are all
* present in the spacemap. However, the spacemap may have
* multiple entries to represent a contiguous range, because it
* is written across multiple sync passes, but the changes of
* all sync passes are consolidated into the range trees.
* Adjacent ranges that are freed in different sync passes of
* one txg will be represented separately (as 2 or more entries)
* in the space map (and its histogram), but these adjacent
* ranges will be consolidated (represented as one entry) in the
* ms_freed/ms_defer[] range trees (and their histograms).
*
* When calculating the weight, we can not simply subtract the
* range trees' histograms from the spacemap's histogram,
* because the range trees' histograms may have entries in
* higher buckets than the spacemap, due to consolidation.
* Instead we must subtract the exact entries that were added to
* the spacemap's histogram. ms_synchist and ms_deferhist[]
* represent these exact entries, so we can subtract them from
* the spacemap's histogram when calculating ms_weight.
*
* ms_synchist represents the same ranges as ms_freeing +
* ms_freed, but without consolidation across sync passes.
*
* ms_deferhist[i] represents the same ranges as ms_defer[i],
* but without consolidation across sync passes.
*/
uint64_t ms_synchist[SPACE_MAP_HISTOGRAM_SIZE];
uint64_t ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE];

/*
* Tracks the exact amount of allocated space of this metaslab
* (and specifically the metaslab's space map) up to the most
* recently completed sync pass [see usage in metaslab_sync()].
*/
uint64_t ms_allocated_space;
int64_t ms_deferspace; /* sum of ms_defermap[] space */
uint64_t ms_weight; /* weight vs. others in group */
uint64_t ms_activation_weight; /* activation weight */
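The long comment above is the heart of the auxiliary-histogram change: to weight an unloaded metaslab, the exact entries recorded in ms_synchist and ms_deferhist[] must be deducted from the space map histogram. A minimal sketch of that deduction (assumed shape, not the in-tree weighting code; metaslab_allocatable_histogram() is an illustrative name):

/* Sketch only: derive an "allocatable" histogram for weighting. */
static void
metaslab_allocatable_histogram(const metaslab_t *msp,
    uint64_t hist[SPACE_MAP_HISTOGRAM_SIZE])
{
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		/* start from what the on-disk space map advertises */
		hist[i] = msp->ms_sm->sm_phys->smp_histogram[i];

		/* subtract this txg's frees that are not allocatable yet */
		hist[i] -= msp->ms_synchist[i];

		/* subtract frees that are still deferred */
		for (int t = 0; t < TXG_DEFER_SIZE; t++)
			hist[i] -= msp->ms_deferhist[t][i];
	}
}
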
@ -412,6 +488,9 @@ struct metaslab {
avl_node_t ms_group_node; /* node in metaslab group tree */
txg_node_t ms_txg_node; /* per-txg dirty metaslab links */

/* updated every time we are done syncing the metaslab's space map */
uint64_t ms_synced_length;

boolean_t ms_new;
};

@ -87,12 +87,13 @@ range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg,
range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg);
void range_tree_destroy(range_tree_t *rt);
boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_verify_not_present(range_tree_t *rt,
uint64_t start, uint64_t size);
range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
uint64_t newstart, uint64_t newsize);
uint64_t range_tree_space(range_tree_t *rt);
boolean_t range_tree_is_empty(range_tree_t *rt);
void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
void range_tree_stat_verify(range_tree_t *rt);
uint64_t range_tree_min(range_tree_t *rt);

@ -24,7 +24,7 @@
*/

/*
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/

#ifndef _SYS_SPACE_MAP_H

@ -55,10 +55,17 @@ extern "C" {
* for backward compatibility.
*/
typedef struct space_map_phys {
uint64_t smp_object; /* on-disk space map object */
uint64_t smp_objsize; /* size of the object */
int64_t smp_alloc; /* space allocated from the map */
uint64_t smp_pad[5]; /* reserved */
/* object number: not needed but kept for backwards compatibility */
uint64_t smp_object;

/* length of the object in bytes */
uint64_t smp_length;

/* space allocated from the map */
int64_t smp_alloc;

/* reserved */
uint64_t smp_pad[5];

/*
* The smp_histogram maintains a histogram of free regions. Each
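The struct change above only renames smp_objsize to smp_length and documents the fields; no field moves, so the on-disk layout is unchanged. A hypothetical compile-time check of that claim (not part of the patch; CTASSERT is the illumos/FreeBSD static-assert macro, and the offsets are assumptions based on the field order shown):

/* Illustrative only: the rename does not change the on-disk layout. */
CTASSERT(offsetof(space_map_phys_t, smp_object) == 0);
CTASSERT(offsetof(space_map_phys_t, smp_length) == 8);
CTASSERT(offsetof(space_map_phys_t, smp_alloc) == 16);
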
@ -81,8 +88,6 @@ typedef struct space_map {
uint64_t sm_start; /* start of map */
uint64_t sm_size; /* size of map */
uint8_t sm_shift; /* unit shift */
uint64_t sm_length; /* synced length */
int64_t sm_alloc; /* synced space allocated */
objset_t *sm_os; /* objset for this map */
uint64_t sm_object; /* object id for this map */
uint32_t sm_blksz; /* block size for space map */

@ -189,18 +194,20 @@ boolean_t sm_entry_is_double_word(uint64_t e);
typedef int (*sm_cb_t)(space_map_entry_t *sme, void *arg);

int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);
int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg);
int space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
uint64_t length);
int space_map_iterate(space_map_t *sm, uint64_t length,
sm_cb_t callback, void *arg);
int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
dmu_tx_t *tx);

boolean_t space_map_histogram_verify(space_map_t *sm, range_tree_t *rt);
void space_map_histogram_clear(space_map_t *sm);
void space_map_histogram_add(space_map_t *sm, range_tree_t *rt,
dmu_tx_t *tx);

void space_map_update(space_map_t *sm);

uint64_t space_map_object(space_map_t *sm);
uint64_t space_map_allocated(space_map_t *sm);
int64_t space_map_allocated(space_map_t *sm);
uint64_t space_map_length(space_map_t *sm);

void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,

@ -216,8 +223,6 @@ int space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
uint64_t start, uint64_t size, uint8_t shift);
void space_map_close(space_map_t *sm);

int64_t space_map_alloc_delta(space_map_t *sm);

#ifdef __cplusplus
}
#endif

@ -268,7 +268,6 @@ struct vdev {
uint64_t vdev_islog; /* is an intent log device */
uint64_t vdev_removing; /* device is being removed? */
boolean_t vdev_ishole; /* is a hole in the namespace */
kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */
uint64_t vdev_top_zap;
vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */

@ -327,16 +326,6 @@ struct vdev {
range_tree_t *vdev_obsolete_segments;
space_map_t *vdev_obsolete_sm;

/*
* The queue depth parameters determine how many async writes are
* still pending (i.e. allocated but not yet issued to disk) per
* top-level (vdev_async_write_queue_depth) and the maximum allowed
* (vdev_max_async_write_queue_depth). These values only apply to
* top-level vdevs.
*/
uint64_t vdev_async_write_queue_depth;
uint64_t vdev_max_async_write_queue_depth;

/*
* Protects the vdev_scan_io_queue field itself as well as the
* structure's contents (when present).

@ -630,7 +630,6 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);

@ -1032,7 +1031,6 @@ vdev_free(vdev_t *vd)
rw_destroy(&vd->vdev_indirect_rwlock);
mutex_destroy(&vd->vdev_obsolete_lock);

mutex_destroy(&vd->vdev_queue_lock);
mutex_destroy(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_stat_lock);
mutex_destroy(&vd->vdev_probe_lock);

@ -1401,12 +1399,12 @@ vdev_metaslab_fini(vdev_t *vd)
}

if (vd->vdev_ms != NULL) {
uint64_t count = vd->vdev_ms_count;
metaslab_group_t *mg = vd->vdev_mg;
metaslab_group_passivate(mg);

metaslab_group_passivate(vd->vdev_mg);
uint64_t count = vd->vdev_ms_count;
for (uint64_t m = 0; m < count; m++) {
metaslab_t *msp = vd->vdev_ms[m];

if (msp != NULL)
metaslab_fini(msp);
}

@ -1414,6 +1412,9 @@ vdev_metaslab_fini(vdev_t *vd)
vd->vdev_ms = NULL;

vd->vdev_ms_count = 0;

for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
ASSERT0(mg->mg_histogram[i]);
}
ASSERT0(vd->vdev_ms_count);
}

@ -2767,13 +2768,6 @@ vdev_dtl_load(vdev_t *vd)
ASSERT(vd->vdev_dtl_sm != NULL);

mutex_enter(&vd->vdev_dtl_lock);

/*
* Now that we've opened the space_map we need to update
* the in-core DTL.
*/
space_map_update(vd->vdev_dtl_sm);

error = space_map_load(vd->vdev_dtl_sm,
vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
mutex_exit(&vd->vdev_dtl_lock);

@ -2933,10 +2927,6 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
}

dmu_tx_commit(tx);

mutex_enter(&vd->vdev_dtl_lock);
space_map_update(vd->vdev_dtl_sm);
mutex_exit(&vd->vdev_dtl_lock);
}

/*

@ -3079,7 +3069,10 @@ vdev_load(vdev_t *vd)
"asize=%llu", (u_longlong_t)vd->vdev_ashift,
(u_longlong_t)vd->vdev_asize);
return (SET_ERROR(ENXIO));
} else if ((error = vdev_metaslab_init(vd, 0)) != 0) {
}

error = vdev_metaslab_init(vd, 0);
if (error != 0) {
vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
"[error=%d]", error);
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,

@ -3093,9 +3086,10 @@ vdev_load(vdev_t *vd)
ASSERT(vd->vdev_asize != 0);
ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);

if ((error = space_map_open(&vd->vdev_checkpoint_sm,
error = space_map_open(&vd->vdev_checkpoint_sm,
mos, checkpoint_sm_obj, 0, vd->vdev_asize,
vd->vdev_ashift))) {
vd->vdev_ashift);
if (error != 0) {
vdev_dbgmsg(vd, "vdev_load: space_map_open "
"failed for checkpoint spacemap (obj %llu) "
"[error=%d]",

@ -3103,15 +3097,15 @@ vdev_load(vdev_t *vd)
return (error);
}
ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
space_map_update(vd->vdev_checkpoint_sm);

/*
* Since the checkpoint_sm contains free entries
* exclusively we can use sm_alloc to indicate the
* culmulative checkpointed space that has been freed.
* exclusively we can use space_map_allocated() to
* indicate the cumulative checkpointed space that
* has been freed.
*/
vd->vdev_stat.vs_checkpoint_space =
-vd->vdev_checkpoint_sm->sm_alloc;
-space_map_allocated(vd->vdev_checkpoint_sm);
vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
vd->vdev_stat.vs_checkpoint_space;
}
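The rewritten comment above is why vs_checkpoint_space can be taken from space_map_allocated(): a checkpoint space map records only frees, so its running allocation count never goes positive. A small illustrative helper (hypothetical, not in the patch):

/* Illustrative only: checkpoint space maps contain free entries only. */
static uint64_t
vdev_checkpointed_space_freed(vdev_t *vd)
{
	int64_t alloc = space_map_allocated(vd->vdev_checkpoint_sm);

	/* every entry is a free, so the running allocation is <= 0 */
	ASSERT3S(alloc, <=, 0);
	return ((uint64_t)(-alloc));
}
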
@ -3143,7 +3137,6 @@ vdev_load(vdev_t *vd)
(u_longlong_t)obsolete_sm_object, error);
return (error);
}
space_map_update(vd->vdev_obsolete_sm);
}

return (0);

@ -3230,47 +3223,6 @@ vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
ASSERT(vd == vd->vdev_top);
ASSERT3U(txg, ==, spa_syncing_txg(spa));

if (vd->vdev_ms != NULL) {
metaslab_group_t *mg = vd->vdev_mg;

metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);

for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];

if (msp == NULL || msp->ms_sm == NULL)
continue;

mutex_enter(&msp->ms_lock);
/*
* If the metaslab was not loaded when the vdev
* was removed then the histogram accounting may
* not be accurate. Update the histogram information
* here so that we ensure that the metaslab group
* and metaslab class are up-to-date.
*/
metaslab_group_histogram_remove(mg, msp);

VERIFY0(space_map_allocated(msp->ms_sm));
space_map_close(msp->ms_sm);
msp->ms_sm = NULL;
mutex_exit(&msp->ms_lock);
}

if (vd->vdev_checkpoint_sm != NULL) {
ASSERT(spa_has_checkpoint(spa));
space_map_close(vd->vdev_checkpoint_sm);
vd->vdev_checkpoint_sm = NULL;
}

metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);

for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
ASSERT0(mg->mg_histogram[i]);
}

dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

vdev_destroy_spacemaps(vd, tx);

@ -3304,17 +3256,14 @@ vdev_sync(vdev_t *vd, uint64_t txg)
spa_t *spa = vd->vdev_spa;
vdev_t *lvd;
metaslab_t *msp;
dmu_tx_t *tx;

ASSERT3U(txg, ==, spa->spa_syncing_txg);
dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
dmu_tx_t *tx;

ASSERT(vd->vdev_removing ||
vd->vdev_ops == &vdev_indirect_ops);

tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
vdev_indirect_sync_obsolete(vd, tx);
dmu_tx_commit(tx);

/*
* If the vdev is indirect, it can't have dirty

@ -3323,6 +3272,7 @@ vdev_sync(vdev_t *vd, uint64_t txg)
if (vd->vdev_ops == &vdev_indirect_ops) {
ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
dmu_tx_commit(tx);
return;
}
}

@ -3333,12 +3283,10 @@ vdev_sync(vdev_t *vd, uint64_t txg)
!vd->vdev_removing) {
ASSERT(vd == vd->vdev_top);
ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
ASSERT(vd->vdev_ms_array != 0);
vdev_config_dirty(vd);
dmu_tx_commit(tx);
}

while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {

@ -3357,6 +3305,7 @@ vdev_sync(vdev_t *vd, uint64_t txg)
vdev_remove_empty_log(vd, txg);

(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
dmu_tx_commit(tx);
}

uint64_t

@ -3586,8 +3535,6 @@ vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
*/
if (error == 0 &&
tvd->vdev_checkpoint_sm != NULL) {
ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc,
!=, 0);
error = ZFS_ERR_CHECKPOINT_EXISTS;
}

@ -680,7 +680,6 @@ spa_condense_indirect_thread(void *arg, zthr_t *zthr)

VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
space_map_update(prev_obsolete_sm);
counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
if (prev_obsolete_sm != NULL) {
vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,

@ -831,7 +830,6 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
spa->spa_meta_objset, obsolete_sm_object,
0, vd->vdev_asize, 0));
space_map_update(vd->vdev_obsolete_sm);
}

ASSERT(vd->vdev_obsolete_sm != NULL);

@ -840,7 +838,6 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)

space_map_write(vd->vdev_obsolete_sm,
vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx);
space_map_update(vd->vdev_obsolete_sm);
range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
}

@ -557,6 +557,7 @@ vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim,
losma.losma_counts = counts;
losma.losma_vim = vim;
VERIFY0(space_map_iterate(obsolete_space_sm,
space_map_length(obsolete_space_sm),
load_obsolete_sm_callback, &losma));
}

@ -442,7 +442,7 @@ vdev_initialize_calculate_progress(vdev_t *vd)
mutex_enter(&msp->ms_lock);

uint64_t ms_free = msp->ms_size -
space_map_allocated(msp->ms_sm);
metaslab_allocated_space(msp);

if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
ms_free /= vd->vdev_top->vdev_children;

@ -283,15 +283,8 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
if (ms->ms_sm == NULL)
continue;

/*
* Sync tasks happen before metaslab_sync(), therefore
* smp_alloc and sm_alloc must be the same.
*/
ASSERT3U(space_map_allocated(ms->ms_sm), ==,
ms->ms_sm->sm_phys->smp_alloc);

spa->spa_removing_phys.sr_to_copy +=
space_map_allocated(ms->ms_sm);
metaslab_allocated_space(ms);

/*
* Space which we are freeing this txg does not need to

@ -1401,22 +1394,8 @@ spa_vdev_remove_thread(void *arg)
* appropriate action (see free_from_removing_vdev()).
*/
if (msp->ms_sm != NULL) {
space_map_t *sm = NULL;

/*
* We have to open a new space map here, because
* ms_sm's sm_length and sm_alloc may not reflect
* what's in the object contents, if we are in between
* metaslab_sync() and metaslab_sync_done().
*/
VERIFY0(space_map_open(&sm,
spa->spa_dsl_pool->dp_meta_objset,
msp->ms_sm->sm_object, msp->ms_sm->sm_start,
msp->ms_sm->sm_size, msp->ms_sm->sm_shift));
space_map_update(sm);
VERIFY0(space_map_load(sm, svr->svr_allocd_segs,
SM_ALLOC));
space_map_close(sm);
VERIFY0(space_map_load(msp->ms_sm,
svr->svr_allocd_segs, SM_ALLOC));

range_tree_walk(msp->ms_freeing,
range_tree_remove, svr->svr_allocd_segs);

@ -1612,16 +1591,6 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
ASSERT0(range_tree_space(msp->ms_freed));

if (msp->ms_sm != NULL) {
/*
* Assert that the in-core spacemap has the same
* length as the on-disk one, so we can use the
* existing in-core spacemap to load it from disk.
*/
ASSERT3U(msp->ms_sm->sm_alloc, ==,
msp->ms_sm->sm_phys->smp_alloc);
ASSERT3U(msp->ms_sm->sm_length, ==,
msp->ms_sm->sm_phys->smp_objsize);

mutex_enter(&svr->svr_lock);
VERIFY0(space_map_load(msp->ms_sm,
svr->svr_allocd_segs, SM_ALLOC));

@ -1714,9 +1683,6 @@ spa_vdev_remove_cancel(spa_t *spa)
return (error);
}

/*
* Called every sync pass of every txg if there's a svr.
*/
void
svr_sync(spa_t *spa, dmu_tx_t *tx)
{

@ -1780,6 +1746,7 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)

ASSERT(vd->vdev_islog);
ASSERT(vd == vd->vdev_top);
ASSERT(MUTEX_HELD(&spa_namespace_lock));

/*
* Stop allocating from this vdev.

@ -1794,15 +1761,14 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
*txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);

/*
* Evacuate the device. We don't hold the config lock as writer
* since we need to do I/O but we do keep the
* Evacuate the device. We don't hold the config lock as
* writer since we need to do I/O but we do keep the
* spa_namespace_lock held. Once this completes the device
* should no longer have any blocks allocated on it.
*/
if (vd->vdev_islog) {
if (vd->vdev_stat.vs_alloc != 0)
error = spa_reset_logs(spa);
}
ASSERT(MUTEX_HELD(&spa_namespace_lock));
if (vd->vdev_stat.vs_alloc != 0)
error = spa_reset_logs(spa);

*txg = spa_vdev_config_enter(spa);

@ -1821,6 +1787,8 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
vdev_dirty_leaves(vd, VDD_DTL, *txg);
vdev_config_dirty(vd);

vdev_metaslab_fini(vd);

spa_history_log_internal(spa, "vdev remove", NULL,
"%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id,
(vd->vdev_path != NULL) ? vd->vdev_path : "-");

@ -1850,6 +1818,8 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
if (list_link_active(&vd->vdev_config_dirty_node))
vdev_config_clean(vd);

ASSERT0(vd->vdev_stat.vs_alloc);

/*
* Clean up the vdev namespace.
*/