1
0
mirror of https://git.FreeBSD.org/src.git synced 2025-01-27 16:39:08 +00:00

Add support for parallel pool exports

Changed spa_export_common() such that it no longer holds the
spa_namespace_lock for the entire duration and instead sets
spa_export_thread to indicate an export is in progress on the
spa.  This allows for an export to a different pool to proceed
in parallel while an export is still processing potentially
long operations like spa_unload_log_sm_flush_all().

Calls like spa_lookup() and spa_vdev_enter() that rely on
the spa_namespace_lock to serialize them against a concurrent
export, now wait for any in-progress export thread to complete
before proceeding.

The 'zpool export -a' sub-command also provides multi-threaded
support, using a thread pool to submit the exports in parallel.

Sponsored-By: Klara Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Wilson <gwilson@delphix.com>
Signed-off-by: Don Brady <don.brady@klarasystems.com>
Closes #16153
This commit is contained in:
Don Brady 2024-05-02 19:28:10 +00:00 committed by Brian Behlendorf
parent abec7dcd30
commit 975a13259b
12 changed files with 373 additions and 33 deletions

View File

@ -2030,10 +2030,19 @@ zpool_do_destroy(int argc, char **argv)
}
/*
 * State shared by the 'zpool export' callbacks; a pointer to it is passed
 * as the 'data' argument of zpool_export_one() and zpool_export_one_async().
 */
typedef struct export_cbdata {
/* thread pool used by 'zpool export -a'; NULL when exporting serially */
tpool_t *tpool;
/* serializes zpool_disable_datasets() calls when a thread pool is in use */
pthread_mutex_t mnttab_lock;
/* forwarded to zpool_disable_datasets() */
boolean_t force;
/* when set, the pool is exported with zpool_export_force() */
boolean_t hardforce;
/* set non-zero by an asynchronous export task that fails */
int retval;
} export_cbdata_t;
/*
 * Argument handed to zpool_export_task() for one asynchronous export.
 * Allocated by zpool_export_one_async() and freed by the task.
 */
typedef struct {
/* strdup()'d pool name; the task re-opens the pool by this name */
char *aea_poolname;
/* shared export options and result, not owned by this structure */
export_cbdata_t *aea_cbdata;
} async_export_args_t;
/*
* Export one pool
*/
@ -2042,11 +2051,20 @@ zpool_export_one(zpool_handle_t *zhp, void *data)
{
export_cbdata_t *cb = data;
if (zpool_disable_datasets(zhp, cb->force) != 0)
return (1);
/*
* zpool_disable_datasets() is not thread-safe for mnttab access.
* So we serialize access here for 'zpool export -a' parallel case.
*/
if (cb->tpool != NULL)
pthread_mutex_lock(&cb->mnttab_lock);
/* The history must be logged as part of the export */
log_history = B_FALSE;
int retval = zpool_disable_datasets(zhp, cb->force);
if (cb->tpool != NULL)
pthread_mutex_unlock(&cb->mnttab_lock);
if (retval)
return (1);
if (cb->hardforce) {
if (zpool_export_force(zhp, history_str) != 0)
@ -2058,6 +2076,48 @@ zpool_export_one(zpool_handle_t *zhp, void *data)
return (0);
}
/*
* Asynchronous export request
*/
static void
zpool_export_task(void *arg)
{
async_export_args_t *aea = arg;
zpool_handle_t *zhp = zpool_open(g_zfs, aea->aea_poolname);
if (zhp != NULL) {
int ret = zpool_export_one(zhp, aea->aea_cbdata);
if (ret != 0)
aea->aea_cbdata->retval = ret;
zpool_close(zhp);
} else {
aea->aea_cbdata->retval = 1;
}
free(aea->aea_poolname);
free(aea);
}
/*
 * Process an export request in parallel
 *
 * Pool iteration callback (used with for_each_pool()): queues an export of
 * the pool behind 'zhp' on the thread pool found in the export_cbdata_t
 * passed as 'data'.  The queued zpool_export_task() owns and frees the
 * request; if the request cannot be queued it is released here instead.
 *
 * Returns 0 when the task was queued, otherwise a non-zero errno value.
 */
static int
zpool_export_one_async(zpool_handle_t *zhp, void *data)
{
	tpool_t *tpool = ((export_cbdata_t *)data)->tpool;
	async_export_args_t *aea = safe_malloc(sizeof (async_export_args_t));

	/* save pool name since zhp will go out of scope */
	aea->aea_poolname = strdup(zpool_get_name(zhp));
	if (aea->aea_poolname == NULL) {
		/* don't queue a task that would open a NULL pool name */
		free(aea);
		return (ENOMEM);
	}
	aea->aea_cbdata = data;

	/* ship off actual export to another thread */
	if (tpool_dispatch(tpool, zpool_export_task, (void *)aea) != 0) {
		/* unlikely; capture errno before the cleanup calls */
		int err = errno;

		/* the task will never run, so nothing else frees these */
		free(aea->aea_poolname);
		free(aea);
		return (err);
	}

	return (0);
}
/*
* zpool export [-f] <pool> ...
*
@ -2098,17 +2158,33 @@ zpool_do_export(int argc, char **argv)
cb.force = force;
cb.hardforce = hardforce;
cb.tpool = NULL;
cb.retval = 0;
argc -= optind;
argv += optind;
/* The history will be logged as part of the export itself */
log_history = B_FALSE;
if (do_all) {
if (argc != 0) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
return (for_each_pool(argc, argv, B_TRUE, NULL,
ZFS_TYPE_POOL, B_FALSE, zpool_export_one, &cb));
cb.tpool = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN),
0, NULL);
pthread_mutex_init(&cb.mnttab_lock, NULL);
/* Asynchronously call zpool_export_one using thread pool */
ret = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL,
B_FALSE, zpool_export_one_async, &cb);
tpool_wait(cb.tpool);
tpool_destroy(cb.tpool);
(void) pthread_mutex_destroy(&cb.mnttab_lock);
return (ret | cb.retval);
}
/* check arguments */

View File

@ -243,6 +243,7 @@ struct spa {
dsl_pool_t *spa_dsl_pool;
boolean_t spa_is_initializing; /* true while opening pool */
boolean_t spa_is_exporting; /* true while exporting pool */
kthread_t *spa_export_thread; /* valid during pool export */
kthread_t *spa_load_thread; /* loading, no namespace lock */
metaslab_class_t *spa_normal_class; /* normal data class */
metaslab_class_t *spa_log_class; /* intent log data class */

View File

@ -8143,11 +8143,11 @@ l2arc_dev_get_next(void)
ASSERT3P(next, !=, NULL);
} while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
next->l2ad_trim_all);
next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting);
/* if we were unable to find any usable vdevs, return NULL */
if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
next->l2ad_trim_all)
next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting)
next = NULL;
l2arc_dev_last = next;

View File

@ -34,6 +34,7 @@
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
* Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
* Copyright (c) 2024, Klara Inc.
*/
/*
@ -1991,7 +1992,8 @@ spa_destroy_aux_threads(spa_t *spa)
static void
spa_unload(spa_t *spa)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED);
spa_import_progress_remove(spa_guid(spa));
@ -6955,7 +6957,7 @@ static int
spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
boolean_t force, boolean_t hardforce)
{
int error;
int error = 0;
spa_t *spa;
hrtime_t export_start = gethrtime();
@ -6979,8 +6981,8 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
spa->spa_is_exporting = B_TRUE;
/*
* Put a hold on the pool, drop the namespace lock, stop async tasks,
* reacquire the namespace lock, and see if we can export.
* Put a hold on the pool, drop the namespace lock, stop async tasks
* and see if we can export.
*/
spa_open_ref(spa, FTAG);
mutex_exit(&spa_namespace_lock);
@ -6990,10 +6992,18 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
taskq_wait(spa->spa_zvol_taskq);
}
mutex_enter(&spa_namespace_lock);
spa->spa_export_thread = curthread;
spa_close(spa, FTAG);
mutex_exit(&spa_namespace_lock);
/*
* At this point we no longer hold the spa_namespace_lock and
* the spa_export_thread indicates that an export is in progress.
*/
if (spa->spa_state == POOL_STATE_UNINITIALIZED)
goto export_spa;
/*
* The pool will be in core if it's openable, in which case we can
* modify its state. Objsets may be open only because they're dirty,
@ -7089,6 +7099,10 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
if (oldconfig && spa->spa_config)
*oldconfig = fnvlist_dup(spa->spa_config);
if (new_state == POOL_STATE_EXPORTED)
zio_handle_export_delay(spa, gethrtime() - export_start);
mutex_enter(&spa_namespace_lock);
if (new_state != POOL_STATE_UNINITIALIZED) {
if (!hardforce)
spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
@ -7100,17 +7114,25 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
* we make sure to reset the exporting flag.
*/
spa->spa_is_exporting = B_FALSE;
spa->spa_export_thread = NULL;
}
if (new_state == POOL_STATE_EXPORTED)
zio_handle_export_delay(spa, gethrtime() - export_start);
/*
* Wake up any waiters on spa_namespace_lock
* They need to re-attempt a spa_lookup()
*/
cv_broadcast(&spa_namespace_cv);
mutex_exit(&spa_namespace_lock);
return (0);
fail:
mutex_enter(&spa_namespace_lock);
spa->spa_is_exporting = B_FALSE;
spa->spa_export_thread = NULL;
spa_async_resume(spa);
/* Wake up any waiters on spa_namespace_lock */
cv_broadcast(&spa_namespace_cv);
mutex_exit(&spa_namespace_lock);
return (error);
}

View File

@ -27,7 +27,7 @@
* Copyright (c) 2017 Datto Inc.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
* Copyright (c) 2023, Klara Inc.
* Copyright (c) 2023, 2024, Klara Inc.
*/
#include <sys/zfs_context.h>
@ -82,8 +82,8 @@
* - Check if spa_refcount is zero
* - Rename a spa_t
* - add/remove/attach/detach devices
* - Held for the duration of create/destroy/export
* - Held at the start and end of import
* - Held for the duration of create/destroy
* - Held at the start and end of import and export
*
* It does not need to handle recursion. A create or destroy may
* reference objects (files or zvols) in other pools, but by
@ -636,8 +636,14 @@ spa_lookup(const char *name)
if (spa == NULL)
return (NULL);
if (spa->spa_load_thread != NULL &&
spa->spa_load_thread != curthread) {
/*
* Avoid racing with import/export, which don't hold the namespace
* lock for their entire duration.
*/
if ((spa->spa_load_thread != NULL &&
spa->spa_load_thread != curthread) ||
(spa->spa_export_thread != NULL &&
spa->spa_export_thread != curthread)) {
cv_wait(&spa_namespace_cv, &spa_namespace_lock);
goto retry;
}
@ -950,14 +956,15 @@ spa_open_ref(spa_t *spa, const void *tag)
/*
* Remove a reference to the given spa_t. Must have at least one reference, or
* have the namespace lock held.
* have the namespace lock held or be part of a pool import/export.
*/
void
spa_close(spa_t *spa, const void *tag)
{
ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref ||
MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_load_thread == curthread);
spa->spa_load_thread == curthread ||
spa->spa_export_thread == curthread);
(void) zfs_refcount_remove(&spa->spa_refcount, tag);
}
@ -977,13 +984,15 @@ spa_async_close(spa_t *spa, const void *tag)
/*
* Check to see if the spa refcount is zero. Must be called with
* spa_namespace_lock held. We really compare against spa_minref, which is the
* number of references acquired when opening a pool
* spa_namespace_lock held or be the spa export thread. We really
* compare against spa_minref, which is the number of references
* acquired when opening a pool
*/
boolean_t
spa_refcount_zero(spa_t *spa)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref);
}
@ -1231,6 +1240,21 @@ spa_vdev_enter(spa_t *spa)
mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock);
/*
* We have a reference on the spa and a spa export could be
* starting but no longer holding the spa_namespace_lock. So
* check if there is an export and if so wait. It will fail
* fast (EBUSY) since we are still holding a spa reference.
*
* Note that we can be woken by a different spa transitioning
* through an import/export, so we must wait for our condition
* to change before proceeding.
*/
while (spa->spa_export_thread != NULL &&
spa->spa_export_thread != curthread) {
cv_wait(&spa_namespace_cv, &spa_namespace_lock);
}
vdev_autotrim_stop_all(spa);
return (spa_vdev_config_enter(spa));
@ -1248,6 +1272,12 @@ spa_vdev_detach_enter(spa_t *spa, uint64_t guid)
mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock);
/* See comment in spa_vdev_enter() */
while (spa->spa_export_thread != NULL &&
spa->spa_export_thread != curthread) {
cv_wait(&spa_namespace_cv, &spa_namespace_lock);
}
vdev_autotrim_stop_all(spa);
if (guid != 0) {

View File

@ -682,7 +682,8 @@ vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list)
(void) spa;
vdev_t *vd;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
while ((vd = list_remove_head(vd_list)) != NULL) {
mutex_enter(&vd->vdev_initialize_lock);
@ -724,7 +725,8 @@ vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state,
if (vd_list == NULL) {
vdev_initialize_stop_wait_impl(vd);
} else {
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
vd->vdev_spa->spa_export_thread == curthread);
list_insert_tail(vd_list, vd);
}
}
@ -756,7 +758,8 @@ vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
spa_t *spa = vd->vdev_spa;
list_t vd_list;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
list_create(&vd_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_initialize_node));

View File

@ -1087,7 +1087,8 @@ vdev_rebuild_stop_wait(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
if (vd == spa->spa_root_vdev) {
for (uint64_t i = 0; i < vd->vdev_children; i++)

View File

@ -1040,7 +1040,8 @@ vdev_trim_stop_wait(spa_t *spa, list_t *vd_list)
(void) spa;
vdev_t *vd;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
while ((vd = list_remove_head(vd_list)) != NULL) {
mutex_enter(&vd->vdev_trim_lock);
@ -1079,7 +1080,8 @@ vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list)
if (vd_list == NULL) {
vdev_trim_stop_wait_impl(vd);
} else {
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
vd->vdev_spa->spa_export_thread == curthread);
list_insert_tail(vd_list, vd);
}
}
@ -1115,7 +1117,8 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state)
list_t vd_list;
vdev_t *vd_l2cache;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
list_create(&vd_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_trim_node));

View File

@ -430,7 +430,8 @@ tags = ['functional', 'cli_root', 'zpool_events']
[tests/functional/cli_root/zpool_export]
tests = ['zpool_export_001_pos', 'zpool_export_002_pos',
'zpool_export_003_neg', 'zpool_export_004_pos']
'zpool_export_003_neg', 'zpool_export_004_pos',
'zpool_export_parallel_pos', 'zpool_export_parallel_admin']
tags = ['functional', 'cli_root', 'zpool_export']
[tests/functional/cli_root/zpool_get]

View File

@ -1084,6 +1084,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/cli_root/zpool_export/zpool_export_002_pos.ksh \
functional/cli_root/zpool_export/zpool_export_003_neg.ksh \
functional/cli_root/zpool_export/zpool_export_004_pos.ksh \
functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh \
functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh \
functional/cli_root/zpool_get/cleanup.ksh \
functional/cli_root/zpool_get/setup.ksh \
functional/cli_root/zpool_get/vdev_get_001_pos.ksh \

View File

@ -0,0 +1,72 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#
# Copyright (c) 2024 Klara, Inc.
#
. $STF_SUITE/include/libtest.shlib
#
# DESCRIPTION:
# Verify that admin commands cannot race a pool export
#
# STRATEGY:
# 1. Create a pool
# 2. Import the pool with an injected delay in the background
# 3. Execute some admin commands against the pool
#
verify_runnable "global"
DEVICE_DIR=$TEST_BASE_DIR/dev_export-test
#
# Exit handler (registered with log_onexit): clear all zinject handlers,
# destroy $TESTPOOL1 if it still exists, and remove the device directory.
#
function cleanup
{
zinject -c all
poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1
[[ -d $DEVICE_DIR ]] && log_must rm -rf $DEVICE_DIR
}
log_assert "admin commands cannot race a pool export"
log_onexit cleanup
[[ ! -d $DEVICE_DIR ]] && log_must mkdir -p $DEVICE_DIR
log_must truncate -s $MINVDEVSIZE ${DEVICE_DIR}/disk0 ${DEVICE_DIR}/disk1
log_must zpool create -f $TESTPOOL1 mirror ${DEVICE_DIR}/disk0 ${DEVICE_DIR}/disk1
log_must zinject -P export -s 10 $TESTPOOL1
log_must zpool export $TESTPOOL1 &
zpool set comment=hello $TESTPOOL1
zpool reguid $TESTPOOL1 &
zpool split $TESTPOOL1 &
log_pass "admin commands cannot race a pool export"

View File

@ -0,0 +1,129 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#
# Copyright (c) 2024 Klara, Inc.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg
. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
# test uses 8 vdevs
MAX_NUM=8
DEVICE_DIR=$TEST_BASE_DIR/dev_import-test
#
# DESCRIPTION:
# Verify that pool exports can occur in parallel
#
# STRATEGY:
# 1. Create 8 pools
# 2. Inject an export delay using zinject
# 3. Export half of the pools synchronously to baseline sequential cost
# 4. Export the other half asynchronously to demonstrate parallel savings
# 5. Import 4 pools
# 6. Test zpool export -a
#
verify_runnable "global"
#
# override the minimum sized vdevs
#
POOLNAME="test_pool"
#
# Exit handler (registered with log_onexit): clear all zinject handlers,
# destroy whichever of the $MAX_NUM test pools still exist, and remove
# the device directory.
#
function cleanup
{
zinject -c all
for i in {0..$(($MAX_NUM - 1))}; do
poolexists $POOLNAME-$i && destroy_pool $POOLNAME-$i
done
[[ -d $DEVICE_DIR ]] && log_must rm -rf $DEVICE_DIR
}
log_assert "Pool exports can occur in parallel"
log_onexit cleanup
[[ ! -d $DEVICE_DIR ]] && log_must mkdir -p $DEVICE_DIR
#
# Create some pools with export delay injectors
#
for i in {0..$(($MAX_NUM - 1))}; do
log_must truncate -s $MINVDEVSIZE ${DEVICE_DIR}/disk$i
log_must zpool create $POOLNAME-$i $DEVICE_DIR/disk$i
log_must zinject -P export -s 8 $POOLNAME-$i
done
#
# Export half of the pools synchronously
#
SECONDS=0
for i in {0..3}; do
log_must zpool export $POOLNAME-$i
done
sequential_time=$SECONDS
log_note "sequentially exported 4 pools in $sequential_time seconds"
#
# Export half of the pools in parallel
#
SECONDS=0
for i in {4..7}; do
log_must zpool export $POOLNAME-$i &
done
wait
parallel_time=$SECONDS
log_note "asyncronously exported 4 pools in $parallel_time seconds"
log_must test $parallel_time -lt $(($sequential_time / 3))
#
# import 4 pools with export delay injectors
#
for i in {4..7}; do
log_must zpool import -d $DEVICE_DIR/disk$i $POOLNAME-$i
log_must zinject -P export -s 8 $POOLNAME-$i
done
#
# now test zpool export -a
#
SECONDS=0
log_must zpool export -a
parallel_time=$SECONDS
log_note "asyncronously exported 4 pools, using '-a', in $parallel_time seconds"
log_must test $parallel_time -lt $(($sequential_time / 3))
log_pass "Pool exports occur in parallel"