mirror of
https://git.FreeBSD.org/src.git
synced 2025-01-16 15:11:52 +00:00
9079 race condition in starting and ending condensing thread for indirect vdevs
illumos/illumos-gate@667ec66f1b The timeline of the race condition is the following: [1] Thread A is about to finish condensing the first vdev in spa_condense_indirect_thread(), so it calls the spa_condense_indirect_complete_sync() sync task which sets the spa_condensing_indirect field to NULL. Waiting for the sync task to finish, thread A sleeps until the txg is done. When this happens, thread A will acquire spa_async_lock and set spa_condense_thread to NULL. [2] While thread A waits for the txg to finish, thread B which is running spa_sync() checks whether it should condense the second vdev in vdev_indirect_should_condense() by checking the spa_condensing_indirect field which was set to NULL by spa_condense_indirect_thread() from thread A. So it goes on and tries to spawn a new condensing thread in spa_condense_indirect_start_sync() and the aforementioned assertion fails because thread A has not set spa_condense_thread to NULL (which is basically the last thing it does before returning). The main issue here is that we rely on both spa_condensing_indirect and spa_condense_thread to signify whether a condensing thread is running. Ideally we would only use one throughout the codebase. In addition, for managing spa_condense_thread we currently use spa_async_lock which basically ties condensing to scrubbing when it comes to pausing and resuming those actions during spa export. Reviewed by: Matt Ahrens <mahrens@delphix.com> Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com> Approved by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> Author: Serapheim Dimitropoulos <serapheim@delphix.com>
This commit is contained in:
parent
250699c304
commit
e7245cfc4c
Notes:
svn2git
2020-12-20 02:59:44 +00:00
svn path=/vendor-sys/illumos/dist/; revision=329799
@ -1421,7 +1421,8 @@ ZFS_COMMON_OBJS += \
|
||||
zio_compress.o \
|
||||
zio_inject.o \
|
||||
zle.o \
|
||||
zrlock.o
|
||||
zrlock.o \
|
||||
zthr.o
|
||||
|
||||
ZFS_SHARED_OBJS += \
|
||||
zfeature_common.o \
|
||||
|
@ -1338,6 +1338,12 @@ spa_unload(spa_t *spa)
|
||||
spa->spa_vdev_removal = NULL;
|
||||
}
|
||||
|
||||
if (spa->spa_condense_zthr != NULL) {
|
||||
ASSERT(!zthr_isrunning(spa->spa_condense_zthr));
|
||||
zthr_destroy(spa->spa_condense_zthr);
|
||||
spa->spa_condense_zthr = NULL;
|
||||
}
|
||||
|
||||
spa_condense_fini(spa);
|
||||
|
||||
bpobj_close(&spa->spa_deferred_bpobj);
|
||||
@ -2079,6 +2085,16 @@ spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
|
||||
return (SET_ERROR(err));
|
||||
}
|
||||
|
||||
static void
|
||||
spa_spawn_aux_threads(spa_t *spa)
|
||||
{
|
||||
ASSERT(spa_writeable(spa));
|
||||
|
||||
ASSERT(MUTEX_HELD(&spa_namespace_lock));
|
||||
|
||||
spa_start_indirect_condensing_thread(spa);
|
||||
}
|
||||
|
||||
/*
|
||||
* Fix up config after a partly-completed split. This is done with the
|
||||
* ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
|
||||
@ -3485,18 +3501,6 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
|
||||
|
||||
ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
|
||||
|
||||
/*
|
||||
* We must check this before we start the sync thread, because
|
||||
* we only want to start a condense thread for condense
|
||||
* operations that were in progress when the pool was
|
||||
* imported. Once we start syncing, spa_sync() could
|
||||
* initiate a condense (and start a thread for it). In
|
||||
* that case it would be wrong to start a second
|
||||
* condense thread.
|
||||
*/
|
||||
boolean_t condense_in_progress =
|
||||
(spa->spa_condensing_indirect != NULL);
|
||||
|
||||
/*
|
||||
* Traverse the ZIL and claim all blocks.
|
||||
*/
|
||||
@ -3549,15 +3553,9 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
|
||||
*/
|
||||
dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
|
||||
|
||||
/*
|
||||
* Note: unlike condensing, we don't need an analogous
|
||||
* "removal_in_progress" dance because no other thread
|
||||
* can start a removal while we hold the spa_namespace_lock.
|
||||
*/
|
||||
spa_restart_removal(spa);
|
||||
|
||||
if (condense_in_progress)
|
||||
spa_condense_indirect_restart(spa);
|
||||
spa_spawn_aux_threads(spa);
|
||||
}
|
||||
|
||||
spa_load_note(spa, "LOADED");
|
||||
@ -4466,6 +4464,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
|
||||
*/
|
||||
txg_wait_synced(spa->spa_dsl_pool, txg);
|
||||
|
||||
spa_spawn_aux_threads(spa);
|
||||
|
||||
spa_write_cachefile(spa, B_FALSE, B_TRUE);
|
||||
spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
|
||||
|
||||
@ -6405,12 +6405,15 @@ spa_async_suspend(spa_t *spa)
|
||||
{
|
||||
mutex_enter(&spa->spa_async_lock);
|
||||
spa->spa_async_suspended++;
|
||||
while (spa->spa_async_thread != NULL ||
|
||||
spa->spa_condense_thread != NULL)
|
||||
while (spa->spa_async_thread != NULL)
|
||||
cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
|
||||
mutex_exit(&spa->spa_async_lock);
|
||||
|
||||
spa_vdev_remove_suspend(spa);
|
||||
|
||||
zthr_t *condense_thread = spa->spa_condense_zthr;
|
||||
if (condense_thread != NULL && zthr_isrunning(condense_thread))
|
||||
VERIFY0(zthr_cancel(condense_thread));
|
||||
}
|
||||
|
||||
void
|
||||
@ -6421,6 +6424,10 @@ spa_async_resume(spa_t *spa)
|
||||
spa->spa_async_suspended--;
|
||||
mutex_exit(&spa->spa_async_lock);
|
||||
spa_restart_removal(spa);
|
||||
|
||||
zthr_t *condense_thread = spa->spa_condense_zthr;
|
||||
if (condense_thread != NULL && !zthr_isrunning(condense_thread))
|
||||
zthr_resume(condense_thread);
|
||||
}
|
||||
|
||||
static boolean_t
|
||||
|
@ -43,6 +43,7 @@
|
||||
#include <sys/bplist.h>
|
||||
#include <sys/bpobj.h>
|
||||
#include <sys/zfeature.h>
|
||||
#include <sys/zthr.h>
|
||||
#include <zfeature_common.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
@ -278,7 +279,7 @@ struct spa {
|
||||
|
||||
spa_condensing_indirect_phys_t spa_condensing_indirect_phys;
|
||||
spa_condensing_indirect_t *spa_condensing_indirect;
|
||||
kthread_t *spa_condense_thread; /* thread doing condense. */
|
||||
zthr_t *spa_condense_zthr; /* zthr doing condense. */
|
||||
|
||||
char *spa_root; /* alternate root directory */
|
||||
uint64_t spa_ena; /* spa-wide ereport ENA */
|
||||
|
@ -76,7 +76,7 @@ extern int spa_remove_init(spa_t *);
|
||||
extern void spa_restart_removal(spa_t *);
|
||||
extern int spa_condense_init(spa_t *);
|
||||
extern void spa_condense_fini(spa_t *);
|
||||
extern void spa_condense_indirect_restart(spa_t *);
|
||||
extern void spa_start_indirect_condensing_thread(spa_t *);
|
||||
extern void spa_vdev_condense_suspend(spa_t *);
|
||||
extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t);
|
||||
extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t, uint64_t);
|
||||
|
@ -30,6 +30,8 @@
|
||||
#include <sys/dmu_tx.h>
|
||||
#include <sys/dsl_synctask.h>
|
||||
#include <sys/zap.h>
|
||||
#include <sys/abd.h>
|
||||
#include <sys/zthr.h>
|
||||
|
||||
/*
|
||||
* An indirect vdev corresponds to a vdev that has been removed. Since
|
||||
@ -475,7 +477,7 @@ spa_condense_indirect_commit_entry(spa_t *spa,
|
||||
|
||||
static void
|
||||
spa_condense_indirect_generate_new_mapping(vdev_t *vd,
|
||||
uint32_t *obsolete_counts, uint64_t start_index)
|
||||
uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
|
||||
{
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
uint64_t mapi = start_index;
|
||||
@ -490,7 +492,15 @@ spa_condense_indirect_generate_new_mapping(vdev_t *vd,
|
||||
(u_longlong_t)vd->vdev_id,
|
||||
(u_longlong_t)mapi);
|
||||
|
||||
while (mapi < old_num_entries && !spa_shutting_down(spa)) {
|
||||
while (mapi < old_num_entries) {
|
||||
|
||||
if (zthr_iscancelled(zthr)) {
|
||||
zfs_dbgmsg("pausing condense of vdev %llu "
|
||||
"at index %llu", (u_longlong_t)vd->vdev_id,
|
||||
(u_longlong_t)mapi);
|
||||
break;
|
||||
}
|
||||
|
||||
vdev_indirect_mapping_entry_phys_t *entry =
|
||||
&old_mapping->vim_entries[mapi];
|
||||
uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
|
||||
@ -508,18 +518,30 @@ spa_condense_indirect_generate_new_mapping(vdev_t *vd,
|
||||
|
||||
mapi++;
|
||||
}
|
||||
if (spa_shutting_down(spa)) {
|
||||
zfs_dbgmsg("pausing condense of vdev %llu at index %llu",
|
||||
(u_longlong_t)vd->vdev_id,
|
||||
(u_longlong_t)mapi);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
spa_condense_indirect_thread(void *arg)
|
||||
/* ARGSUSED */
|
||||
static boolean_t
|
||||
spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
|
||||
{
|
||||
vdev_t *vd = arg;
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
spa_t *spa = arg;
|
||||
|
||||
return (spa->spa_condensing_indirect != NULL);
|
||||
}
|
||||
|
||||
/* ARGSUSED */
|
||||
static int
|
||||
spa_condense_indirect_thread(void *arg, zthr_t *zthr)
|
||||
{
|
||||
spa_t *spa = arg;
|
||||
vdev_t *vd;
|
||||
|
||||
ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
|
||||
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
|
||||
vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
|
||||
ASSERT3P(vd, !=, NULL);
|
||||
spa_config_exit(spa, SCL_VDEV, FTAG);
|
||||
|
||||
spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
|
||||
spa_condensing_indirect_phys_t *scip =
|
||||
&spa->spa_condensing_indirect_phys;
|
||||
@ -593,25 +615,24 @@ spa_condense_indirect_thread(void *arg)
|
||||
}
|
||||
}
|
||||
|
||||
spa_condense_indirect_generate_new_mapping(vd, counts, start_index);
|
||||
spa_condense_indirect_generate_new_mapping(vd, counts,
|
||||
start_index, zthr);
|
||||
|
||||
vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);
|
||||
|
||||
/*
|
||||
* We may have bailed early from generate_new_mapping(), if
|
||||
* the spa is shutting down. In this case, do not complete
|
||||
* the condense.
|
||||
* If the zthr has received a cancellation signal while running
|
||||
* in generate_new_mapping() or at any point after that, then bail
|
||||
* early. We don't want to complete the condense if the spa is
|
||||
* shutting down.
|
||||
*/
|
||||
if (!spa_shutting_down(spa)) {
|
||||
VERIFY0(dsl_sync_task(spa_name(spa), NULL,
|
||||
spa_condense_indirect_complete_sync, sci, 0,
|
||||
ZFS_SPACE_CHECK_NONE));
|
||||
}
|
||||
if (zthr_iscancelled(zthr))
|
||||
return (0);
|
||||
|
||||
mutex_enter(&spa->spa_async_lock);
|
||||
spa->spa_condense_thread = NULL;
|
||||
cv_broadcast(&spa->spa_async_cv);
|
||||
mutex_exit(&spa->spa_async_lock);
|
||||
VERIFY0(dsl_sync_task(spa_name(spa), NULL,
|
||||
spa_condense_indirect_complete_sync, sci, 0, ZFS_SPACE_CHECK_NONE));
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -664,9 +685,7 @@ spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
|
||||
(u_longlong_t)scip->scip_prev_obsolete_sm_object,
|
||||
(u_longlong_t)scip->scip_next_mapping_object);
|
||||
|
||||
ASSERT3P(spa->spa_condense_thread, ==, NULL);
|
||||
spa->spa_condense_thread = thread_create(NULL, 0,
|
||||
spa_condense_indirect_thread, vd, 0, &p0, TS_RUN, minclsyspri);
|
||||
zthr_wakeup(spa->spa_condense_zthr);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -743,24 +762,12 @@ spa_condense_fini(spa_t *spa)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Restart the condense - called when the pool is opened.
|
||||
*/
|
||||
void
|
||||
spa_condense_indirect_restart(spa_t *spa)
|
||||
spa_start_indirect_condensing_thread(spa_t *spa)
|
||||
{
|
||||
vdev_t *vd;
|
||||
ASSERT(spa->spa_condensing_indirect != NULL);
|
||||
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
|
||||
vd = vdev_lookup_top(spa,
|
||||
spa->spa_condensing_indirect_phys.scip_vdev);
|
||||
ASSERT(vd != NULL);
|
||||
spa_config_exit(spa, SCL_VDEV, FTAG);
|
||||
|
||||
ASSERT3P(spa->spa_condense_thread, ==, NULL);
|
||||
spa->spa_condense_thread = thread_create(NULL, 0,
|
||||
spa_condense_indirect_thread, vd, 0, &p0, TS_RUN,
|
||||
minclsyspri);
|
||||
ASSERT3P(spa->spa_condense_zthr, ==, NULL);
|
||||
spa->spa_condense_zthr = zthr_create(spa_condense_indirect_thread_check,
|
||||
spa_condense_indirect_thread, spa);
|
||||
}
|
||||
|
||||
/*
|
||||
|
Loading…
Reference in New Issue
Block a user