1
0
mirror of https://git.FreeBSD.org/src.git synced 2024-12-17 10:26:15 +00:00

MFV r284762: 5269 zpool import slow

illumos/illumos-gate@12380e1e70

https://www.illumos.org/issues/5269
  When importing a pool (at boot or with zpool import) with many
  filesystem, the process can take minutes. It doesn't matter whether
  the pool has been exported cleanly or uncleanly.  The problem is that
  each dataset has its own log chain. On import, all datasets have to be
  checked if there are logs to replay.  The idea is to speed up this
  process by paralellizing it.

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george@delphix.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Dan McDonald <danmcd@omniti.com>
Author: Arne Jansen <jansen@webgods.de>
This commit is contained in:
Alexander Motin 2015-08-12 18:47:30 +00:00
commit 101a6d4eac
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=286686
10 changed files with 239 additions and 60 deletions

View File

@ -25,6 +25,7 @@
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
*/
/* Portions Copyright 2010 Robert Milkowski */
@ -49,6 +50,7 @@
#include <sys/sa.h>
#include <sys/zfs_onexit.h>
#include <sys/dsl_destroy.h>
#include <sys/vdev.h>
/*
* Needed to close a window in dnode_move() that allows the objset to be freed
@ -56,6 +58,16 @@
*/
krwlock_t os_lock;
/*
* Tunable to overwrite the maximum number of threads for the parallization
* of dmu_objset_find_dp, needed to speed up the import of pools with many
* datasets.
* Default is 4 times the number of leaf vdevs.
*/
int dmu_find_threads = 0;
static void dmu_objset_find_dp_cb(void *arg);
void
dmu_objset_init(void)
{
@ -504,6 +516,25 @@ dmu_objset_hold(const char *name, void *tag, objset_t **osp)
return (err);
}
static int
dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
boolean_t readonly, void *tag, objset_t **osp)
{
int err;
err = dmu_objset_from_ds(ds, osp);
if (err != 0) {
dsl_dataset_disown(ds, tag);
} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
dsl_dataset_disown(ds, tag);
return (SET_ERROR(EINVAL));
} else if (!readonly && dsl_dataset_is_snapshot(ds)) {
dsl_dataset_disown(ds, tag);
return (SET_ERROR(EROFS));
}
return (err);
}
/*
* dsl_pool must not be held when this is called.
* Upon successful return, there will be a longhold on the dataset,
@ -525,21 +556,26 @@ dmu_objset_own(const char *name, dmu_objset_type_t type,
dsl_pool_rele(dp, FTAG);
return (err);
}
err = dmu_objset_from_ds(ds, osp);
err = dmu_objset_own_impl(ds, type, readonly, tag, osp);
dsl_pool_rele(dp, FTAG);
if (err != 0) {
dsl_dataset_disown(ds, tag);
} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
dsl_dataset_disown(ds, tag);
return (SET_ERROR(EINVAL));
} else if (!readonly && ds->ds_is_snapshot) {
dsl_dataset_disown(ds, tag);
return (SET_ERROR(EROFS));
}
return (err);
}
int
dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
boolean_t readonly, void *tag, objset_t **osp)
{
dsl_dataset_t *ds;
int err;
err = dsl_dataset_own_obj(dp, obj, tag, &ds);
if (err != 0)
return (err);
return (dmu_objset_own_impl(ds, type, readonly, tag, osp));
}
void
dmu_objset_rele(objset_t *os, void *tag)
{
@ -1580,30 +1616,41 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name,
return (0);
}
/*
* Find objsets under and including ddobj, call func(ds) on each.
*/
int
dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
typedef struct dmu_objset_find_ctx {
taskq_t *dc_tq;
dsl_pool_t *dc_dp;
uint64_t dc_ddobj;
int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
void *dc_arg;
int dc_flags;
kmutex_t *dc_error_lock;
int *dc_error;
} dmu_objset_find_ctx_t;
static void
dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
{
dsl_pool_t *dp = dcp->dc_dp;
dmu_objset_find_ctx_t *child_dcp;
dsl_dir_t *dd;
dsl_dataset_t *ds;
zap_cursor_t zc;
zap_attribute_t *attr;
uint64_t thisobj;
int err;
int err = 0;
ASSERT(dsl_pool_config_held(dp));
/* don't process if there already was an error */
if (*dcp->dc_error != 0)
goto out;
err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd);
err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, NULL, FTAG, &dd);
if (err != 0)
return (err);
goto out;
/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
if (dd->dd_myname[0] == '$') {
dsl_dir_rele(dd, FTAG);
return (0);
goto out;
}
thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
@ -1612,7 +1659,7 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
/*
* Iterate over all children.
*/
if (flags & DS_FIND_CHILDREN) {
if (dcp->dc_flags & DS_FIND_CHILDREN) {
for (zap_cursor_init(&zc, dp->dp_meta_objset,
dsl_dir_phys(dd)->dd_child_dir_zapobj);
zap_cursor_retrieve(&zc, attr) == 0;
@ -1621,24 +1668,22 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
sizeof (uint64_t));
ASSERT3U(attr->za_num_integers, ==, 1);
err = dmu_objset_find_dp(dp, attr->za_first_integer,
func, arg, flags);
if (err != 0)
break;
child_dcp = kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
*child_dcp = *dcp;
child_dcp->dc_ddobj = attr->za_first_integer;
if (dcp->dc_tq != NULL)
(void) taskq_dispatch(dcp->dc_tq,
dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
else
dmu_objset_find_dp_impl(child_dcp);
}
zap_cursor_fini(&zc);
if (err != 0) {
dsl_dir_rele(dd, FTAG);
kmem_free(attr, sizeof (zap_attribute_t));
return (err);
}
}
/*
* Iterate over all snapshots.
*/
if (flags & DS_FIND_SNAPSHOTS) {
if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
dsl_dataset_t *ds;
err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
@ -1659,7 +1704,7 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
attr->za_first_integer, FTAG, &ds);
if (err != 0)
break;
err = func(dp, ds, arg);
err = dcp->dc_func(dp, ds, dcp->dc_arg);
dsl_dataset_rele(ds, FTAG);
if (err != 0)
break;
@ -1672,17 +1717,115 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
kmem_free(attr, sizeof (zap_attribute_t));
if (err != 0)
return (err);
goto out;
/*
* Apply to self.
*/
err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
if (err != 0)
return (err);
err = func(dp, ds, arg);
goto out;
err = dcp->dc_func(dp, ds, dcp->dc_arg);
dsl_dataset_rele(ds, FTAG);
return (err);
out:
if (err != 0) {
mutex_enter(dcp->dc_error_lock);
/* only keep first error */
if (*dcp->dc_error == 0)
*dcp->dc_error = err;
mutex_exit(dcp->dc_error_lock);
}
kmem_free(dcp, sizeof (*dcp));
}
static void
dmu_objset_find_dp_cb(void *arg)
{
dmu_objset_find_ctx_t *dcp = arg;
dsl_pool_t *dp = dcp->dc_dp;
dsl_pool_config_enter(dp, FTAG);
dmu_objset_find_dp_impl(dcp);
dsl_pool_config_exit(dp, FTAG);
}
/*
* Find objsets under and including ddobj, call func(ds) on each.
* The order for the enumeration is completely undefined.
* func is called with dsl_pool_config held.
*/
int
dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
{
int error = 0;
taskq_t *tq = NULL;
int ntasks;
dmu_objset_find_ctx_t *dcp;
kmutex_t err_lock;
mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
dcp->dc_tq = NULL;
dcp->dc_dp = dp;
dcp->dc_ddobj = ddobj;
dcp->dc_func = func;
dcp->dc_arg = arg;
dcp->dc_flags = flags;
dcp->dc_error_lock = &err_lock;
dcp->dc_error = &error;
if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
/*
* In case a write lock is held we can't make use of
* parallelism, as down the stack of the worker threads
* the lock is asserted via dsl_pool_config_held.
* In case of a read lock this is solved by getting a read
* lock in each worker thread, which isn't possible in case
* of a writer lock. So we fall back to the synchronous path
* here.
* In the future it might be possible to get some magic into
* dsl_pool_config_held in a way that it returns true for
* the worker threads so that a single lock held from this
* thread suffices. For now, stay single threaded.
*/
dmu_objset_find_dp_impl(dcp);
return (error);
}
ntasks = dmu_find_threads;
if (ntasks == 0)
ntasks = vdev_count_leaves(dp->dp_spa) * 4;
tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks,
INT_MAX, 0);
if (tq == NULL) {
kmem_free(dcp, sizeof (*dcp));
return (SET_ERROR(ENOMEM));
}
dcp->dc_tq = tq;
/* dcp will be freed by task */
(void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
/*
* PORTING: this code relies on the property of taskq_wait to wait
* until no more tasks are queued and no more tasks are active. As
* we always queue new tasks from within other tasks, task_wait
* reliably waits for the full recursion to finish, even though we
* enqueue new tasks after taskq_wait has been called.
* On platforms other than illumos, taskq_wait may not have this
* property.
*/
taskq_wait(tq);
taskq_destroy(tq);
mutex_destroy(&err_lock);
return (error);
}
/*

View File

@ -851,7 +851,7 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
ASSERT(dp->dp_origin_snap != NULL);
VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
tx, DS_FIND_CHILDREN));
tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
}
/* ARGSUSED */
@ -905,7 +905,7 @@ dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
}
void
@ -1149,3 +1149,9 @@ dsl_pool_config_held(dsl_pool_t *dp)
{
return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
}
boolean_t
dsl_pool_config_held_writer(dsl_pool_t *dp)
{
return (RRW_WRITE_HELD(&dp->dp_config_rwlock));
}

View File

@ -1792,13 +1792,14 @@ static boolean_t
spa_check_logs(spa_t *spa)
{
boolean_t rv = B_FALSE;
dsl_pool_t *dp = spa_get_dsl(spa);
switch (spa->spa_log_state) {
case SPA_LOG_MISSING:
/* need to recheck in case slog has been restored */
case SPA_LOG_UNKNOWN:
rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain,
NULL, DS_FIND_CHILDREN) != 0);
rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
if (rv)
spa_set_log_state(spa, SPA_LOG_MISSING);
break;
@ -2786,6 +2787,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
spa->spa_load_max_txg == UINT64_MAX)) {
dmu_tx_t *tx;
int need_update = B_FALSE;
dsl_pool_t *dp = spa_get_dsl(spa);
ASSERT(state != SPA_LOAD_TRYIMPORT);
@ -2798,9 +2800,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
*/
spa->spa_claiming = B_TRUE;
tx = dmu_tx_create_assigned(spa_get_dsl(spa),
spa_first_txg(spa));
(void) dmu_objset_find(spa_name(spa),
tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
(void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
zil_claim, tx, DS_FIND_CHILDREN);
dmu_tx_commit(tx);

View File

@ -243,6 +243,7 @@ void zfs_znode_byteswap(void *buf, size_t size);
#define DS_FIND_SNAPSHOTS (1<<0)
#define DS_FIND_CHILDREN (1<<1)
#define DS_FIND_SERIALIZE (1<<2)
/*
* The maximum number of bytes that can be accessed as part of one

View File

@ -142,6 +142,8 @@ struct objset {
int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
int dmu_objset_own(const char *name, dmu_objset_type_t type,
boolean_t readonly, void *tag, objset_t **osp);
int dmu_objset_own_obj(struct dsl_pool *dp, uint64_t obj,
dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp);
void dmu_objset_refresh_ownership(objset_t *os, void *tag);
void dmu_objset_rele(objset_t *os, void *tag);
void dmu_objset_disown(objset_t *os, void *tag);

View File

@ -154,6 +154,7 @@ void dsl_pool_mos_diduse_space(dsl_pool_t *dp,
void dsl_pool_config_enter(dsl_pool_t *dp, void *tag);
void dsl_pool_config_exit(dsl_pool_t *dp, void *tag);
boolean_t dsl_pool_config_held(dsl_pool_t *dp);
boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp);
boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp);
taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);

View File

@ -61,6 +61,7 @@ extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
extern boolean_t vdev_is_bootable(vdev_t *vd);
extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
extern int vdev_count_leaves(spa_t *spa);
extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
uint64_t txg, uint64_t size);
extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,

View File

@ -37,6 +37,9 @@
extern "C" {
#endif
struct dsl_pool;
struct dsl_dataset;
/*
* Intent log format:
*
@ -404,8 +407,10 @@ extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
extern void zil_commit(zilog_t *zilog, uint64_t oid);
extern int zil_vdev_offline(const char *osname, void *txarg);
extern int zil_claim(const char *osname, void *txarg);
extern int zil_check_log_chain(const char *osname, void *txarg);
extern int zil_claim(struct dsl_pool *dp,
struct dsl_dataset *ds, void *txarg);
extern int zil_check_log_chain(struct dsl_pool *dp,
struct dsl_dataset *ds, void *tx);
extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
extern void zil_clean(zilog_t *zilog, uint64_t synced_txg);

View File

@ -272,6 +272,26 @@ vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
return (NULL);
}
static int
vdev_count_leaves_impl(vdev_t *vd)
{
int n = 0;
if (vd->vdev_ops->vdev_op_leaf)
return (1);
for (int c = 0; c < vd->vdev_children; c++)
n += vdev_count_leaves_impl(vd->vdev_child[c]);
return (n);
}
int
vdev_count_leaves(spa_t *spa)
{
return (vdev_count_leaves_impl(spa->spa_root_vdev));
}
void
vdev_add_child(vdev_t *pvd, vdev_t *cvd)
{

View File

@ -636,7 +636,7 @@ zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
}
int
zil_claim(const char *osname, void *txarg)
zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
{
dmu_tx_t *tx = txarg;
uint64_t first_txg = dmu_tx_get_txg(tx);
@ -645,15 +645,16 @@ zil_claim(const char *osname, void *txarg)
objset_t *os;
int error;
error = dmu_objset_own(osname, DMU_OST_ANY, B_FALSE, FTAG, &os);
error = dmu_objset_own_obj(dp, ds->ds_object,
DMU_OST_ANY, B_FALSE, FTAG, &os);
if (error != 0) {
/*
* EBUSY indicates that the objset is inconsistent, in which
* case it can not have a ZIL.
*/
if (error != EBUSY) {
cmn_err(CE_WARN, "can't open objset for %s, error %u",
osname, error);
cmn_err(CE_WARN, "can't open objset for %llu, error %u",
(unsigned long long)ds->ds_object, error);
}
return (0);
}
@ -700,8 +701,9 @@ zil_claim(const char *osname, void *txarg)
* Checksum errors are ok as they indicate the end of the chain.
* Any other error (no device or read failure) returns an error.
*/
/* ARGSUSED */
int
zil_check_log_chain(const char *osname, void *tx)
zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
{
zilog_t *zilog;
objset_t *os;
@ -710,9 +712,10 @@ zil_check_log_chain(const char *osname, void *tx)
ASSERT(tx == NULL);
error = dmu_objset_hold(osname, FTAG, &os);
error = dmu_objset_from_ds(ds, &os);
if (error != 0) {
cmn_err(CE_WARN, "can't open objset for %s", osname);
cmn_err(CE_WARN, "can't open objset %llu, error %d",
(unsigned long long)ds->ds_object, error);
return (0);
}
@ -735,10 +738,8 @@ zil_check_log_chain(const char *osname, void *tx)
valid = vdev_log_state_valid(vd);
spa_config_exit(os->os_spa, SCL_STATE, FTAG);
if (!valid) {
dmu_objset_rele(os, FTAG);
if (!valid)
return (0);
}
}
/*
@ -751,8 +752,6 @@ zil_check_log_chain(const char *osname, void *tx)
error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));
dmu_objset_rele(os, FTAG);
return ((error == ECKSUM || error == ENOENT) ? 0 : error);
}