1
0
mirror of https://git.FreeBSD.org/src.git synced 2025-01-20 15:43:16 +00:00

Illumos 4390 - I/O errors can corrupt space map when deleting fs/vol

4390 i/o errors when deleting filesystem/zvol can lead to space map corruption
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Approved by: Dan McDonald <danmcd@omniti.com>

References:
  https://www.illumos.org/issues/4390
  https://github.com/illumos/illumos-gate/commit/7fd05ac

Porting notes:

Previous stack-reduction efforts in traverse_visitb() caused a fair
number of un-mergable pieces of code.  This patch should reduce its
stack footprint a bit more.

The new local bptree_entry_phys_t in bptree_add() is dynamically-allocated
using kmem_zalloc() for the purpose of stack reduction.

The new global zfs_free_leak_on_eio has been defined as an integer
rather than a boolean_t as was the case with the related zfs_recover
global.  Also, zfs_free_leak_on_eio's definition has been inserted into
zfs_debug.c for consistency with the existing definition of zfs_recover.
Illumos placed it in spa_misc.c.

Ported by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2545
This commit is contained in:
Matthew Ahrens 2014-06-05 13:20:08 -08:00 committed by Brian Behlendorf
parent 9b67f60560
commit fbeddd60b7
17 changed files with 339 additions and 157 deletions

View File

@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
*/
#ifndef _SYS_BPTREE_H
@ -50,6 +50,7 @@ typedef int bptree_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
uint64_t bptree_alloc(objset_t *os, dmu_tx_t *tx);
int bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
boolean_t bptree_is_empty(objset_t *os, uint64_t obj);
void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx);

View File

@ -250,7 +250,6 @@ void zfs_znode_byteswap(void *buf, size_t size);
#define DMU_USERUSED_OBJECT (-1ULL)
#define DMU_GROUPUSED_OBJECT (-2ULL)
#define DMU_DEADLIST_OBJECT (-3ULL)
/*
* artificial blkids for bonus buffer and spill blocks

View File

@ -144,6 +144,7 @@ void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value,
#define ORIGIN_DIR_NAME "$ORIGIN"
#define XLATION_DIR_NAME "$XLATION"
#define FREE_DIR_NAME "$FREE"
#define LEAK_DIR_NAME "$LEAK"
#ifdef ZFS_DEBUG
#define dprintf_dd(dd, fmt, ...) do { \

View File

@ -87,6 +87,7 @@ typedef struct dsl_pool {
struct dsl_dir *dp_root_dir;
struct dsl_dir *dp_mos_dir;
struct dsl_dir *dp_free_dir;
struct dsl_dir *dp_leak_dir;
struct dsl_dataset *dp_origin_snap;
uint64_t dp_root_dir_obj;
struct taskq *dp_iput_taskq;

View File

@ -116,6 +116,7 @@ typedef struct dsl_scan {
/* for freeing blocks */
boolean_t scn_is_bptree;
boolean_t scn_async_destroying;
boolean_t scn_async_stalled;
/* for debugging / information */
uint64_t scn_visited_this_txg;

View File

@ -193,6 +193,7 @@ typedef enum {
ZPOOL_PROP_COMMENT,
ZPOOL_PROP_EXPANDSZ,
ZPOOL_PROP_FREEING,
ZPOOL_PROP_LEAKED,
ZPOOL_NUM_PROPS
} zpool_prop_t;

View File

@ -48,6 +48,7 @@ extern "C" {
extern int zfs_flags;
extern int zfs_recover;
extern int zfs_free_leak_on_eio;
#define ZFS_DEBUG_DPRINTF (1<<0)
#define ZFS_DEBUG_DBUF_VERIFY (1<<1)

View File

@ -316,6 +316,7 @@ zpool_get_prop_literal(zpool_handle_t *zhp, zpool_prop_t prop, char *buf,
case ZPOOL_PROP_ALLOCATED:
case ZPOOL_PROP_FREE:
case ZPOOL_PROP_FREEING:
case ZPOOL_PROP_LEAKED:
case ZPOOL_PROP_EXPANDSZ:
case ZPOOL_PROP_ASHIFT:
if (literal)

View File

@ -696,6 +696,43 @@ Set additional debugging flags
Default value: \fB1\fR.
.RE
.sp
.ne 2
.na
\fBzfs_free_leak_on_eio\fR (int)
.ad
.RS 12n
If destroy encounters an EIO while reading metadata (e.g. indirect
blocks), space referenced by the missing metadata can not be freed.
Normally this causes the background destroy to become "stalled", as
it is unable to make forward progress. While in this stalled state,
all remaining space to free from the error-encountering filesystem is
"temporarily leaked". Set this flag to cause it to ignore the EIO,
permanently leak the space from indirect blocks that can not be read,
and continue to free everything else that it can.
The default, "stalling" behavior is useful if the storage partially
fails (i.e. some but not all i/os fail), and then later recovers. In
this case, we will be able to continue pool operations while it is
partially failed, and when it recovers, we can continue to free the
space, with no leaks. However, note that this case is actually
fairly rare.
Typically pools either (a) fail completely (but perhaps temporarily,
e.g. a top-level vdev going offline), or (b) have localized,
permanent errors (e.g. disk returns the wrong data due to bit flip or
firmware bug). In case (a), this setting does not matter because the
pool will be suspended and the sync thread will not be able to make
forward progress regardless. In case (b), because the error is
permanent, the best we can do is leak the minimum amount of space,
which is what setting this flag will do. Therefore, it is reasonable
for this flag to normally be set, but we chose the more conservative
approach of not setting it, so that there is no possibility of
leaking space in the "partial temporary" failure case.
.sp
Default value: \fB0\fR.
.RE
.sp
.ne 2
.na

View File

@ -81,6 +81,8 @@ zpool_prop_init(void)
ZFS_TYPE_POOL, "<size>", "FREE");
zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY,
ZFS_TYPE_POOL, "<size>", "FREEING");
zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY,
ZFS_TYPE_POOL, "<size>", "LEAKED");
zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0,
PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC");
zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0,

View File

@ -102,13 +102,27 @@ bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
return (dmu_object_free(os, obj, tx));
}
boolean_t
bptree_is_empty(objset_t *os, uint64_t obj)
{
dmu_buf_t *db;
bptree_phys_t *bt;
boolean_t rv;
VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
bt = db->db_data;
rv = (bt->bt_begin == bt->bt_end);
dmu_buf_rele(db, FTAG);
return (rv);
}
void
bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
{
dmu_buf_t *db;
bptree_phys_t *bt;
bptree_entry_phys_t bte;
bptree_entry_phys_t *bte;
/*
* bptree objects are in the pool mos, therefore they can only be
@ -120,10 +134,11 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
bt = db->db_data;
bte.be_birth_txg = birth_txg;
bte.be_bp = *bp;
bzero(&bte.be_zb, sizeof (bte.be_zb));
dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);
bte = kmem_zalloc(sizeof (*bte), KM_PUSHPAGE);
bte->be_birth_txg = birth_txg;
bte->be_bp = *bp;
dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx);
kmem_free(bte, sizeof (*bte));
dmu_buf_will_dirty(db, tx);
bt->bt_end++;
@ -153,10 +168,27 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
return (err);
}
/*
* If "free" is set:
* - It is assumed that "func" will be freeing the block pointers.
* - If "func" returns nonzero, the bookmark will be remembered and
* iteration will be restarted from this point on next invocation.
* - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
* bptree_iterate will remember the bookmark, continue traversing
* any additional entries, and return 0.
*
* If "free" is not set, traversal will stop and return an error if
* an i/o error is encountered.
*
* In either case, if zfs_free_leak_on_eio is set, i/o errors will be
* ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
* traverse_dataset_destroyed()).
*/
int
bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
void *arg, dmu_tx_t *tx)
{
boolean_t ioerr = B_FALSE;
int err;
uint64_t i;
dmu_buf_t *db;
@ -182,49 +214,82 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
bptree_entry_phys_t bte;
int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;
ASSERT(!free || i == ba.ba_phys->bt_begin);
err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
&bte, DMU_READ_NO_PREFETCH);
if (err != 0)
break;
if (zfs_recover)
if (zfs_free_leak_on_eio)
flags |= TRAVERSE_HARD;
zfs_dbgmsg("bptree index %d: traversing from min_txg=%lld "
"bookmark %lld/%lld/%lld/%lld",
i, (longlong_t)bte.be_birth_txg,
(longlong_t)bte.be_zb.zb_objset,
(longlong_t)bte.be_zb.zb_object,
(longlong_t)bte.be_zb.zb_level,
(longlong_t)bte.be_zb.zb_blkid);
err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
bte.be_birth_txg, &bte.be_zb, flags,
bptree_visit_cb, &ba);
if (free) {
if (err == ERESTART) {
/*
* The callback has freed the visited block pointers.
* Record our traversal progress on disk, either by
* updating this record's bookmark, or by logically
* removing this record by advancing bt_begin.
*/
if (err != 0) {
/* save bookmark for future resume */
ASSERT3U(bte.be_zb.zb_objset, ==,
ZB_DESTROYED_OBJSET);
ASSERT0(bte.be_zb.zb_level);
dmu_write(os, obj, i * sizeof (bte),
sizeof (bte), &bte, tx);
if (err == EIO || err == ECKSUM ||
err == ENXIO) {
/*
* Skip the rest of this tree and
* continue on to the next entry.
*/
err = 0;
ioerr = B_TRUE;
} else {
break;
}
if (err != 0) {
} else if (ioerr) {
/*
* We can not properly handle an i/o
* error, because the traversal code
* does not know how to resume from an
* arbitrary bookmark.
* This entry is finished, but there were
* i/o errors on previous entries, so we
* can't adjust bt_begin. Set this entry's
* be_birth_txg such that it will be
* treated as a no-op in future traversals.
*/
zfs_panic_recover("error %u from "
"traverse_dataset_destroyed()", err);
bte.be_birth_txg = UINT64_MAX;
dmu_write(os, obj, i * sizeof (bte),
sizeof (bte), &bte, tx);
}
if (!ioerr) {
ba.ba_phys->bt_begin++;
(void) dmu_free_range(os, obj,
i * sizeof (bte), sizeof (bte), tx);
}
} else if (err != 0) {
break;
}
}
ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
ASSERT(!free || err != 0 || ioerr ||
ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
/* if all blocks are free there should be no used space */
if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
if (zfs_free_leak_on_eio) {
ba.ba_phys->bt_bytes = 0;
ba.ba_phys->bt_comp = 0;
ba.ba_phys->bt_uncomp = 0;
}
ASSERT0(ba.ba_phys->bt_bytes);
ASSERT0(ba.ba_phys->bt_comp);
ASSERT0(ba.ba_phys->bt_uncomp);

View File

@ -58,12 +58,11 @@ typedef struct traverse_data {
zbookmark_t *td_resume;
int td_flags;
prefetch_data_t *td_pfd;
boolean_t td_paused;
blkptr_cb_t *td_func;
void *td_arg;
} traverse_data_t;
#define TD_HARD(td) (td->td_flags & TRAVERSE_HARD)
static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
uint64_t objset, uint64_t object);
static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
@ -165,7 +164,6 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
* If we found the block we're trying to resume from, zero
* the bookmark out to indicate that we have resumed.
*/
ASSERT3U(zb->zb_object, <=, td->td_resume->zb_object);
if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
bzero(td->td_resume, sizeof (*zb));
if (td->td_flags & TRAVERSE_POST)
@ -175,14 +173,6 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
return (RESUME_SKIP_NONE);
}
static void
traverse_pause(traverse_data_t *td, const zbookmark_t *zb)
{
ASSERT(td->td_resume != NULL);
ASSERT0(zb->zb_level);
bcopy(zb, td->td_resume, sizeof (*td->td_resume));
}
static void
traverse_prefetch_metadata(traverse_data_t *td,
const blkptr_t *bp, const zbookmark_t *zb)
@ -211,9 +201,8 @@ static int
traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
const blkptr_t *bp, const zbookmark_t *zb)
{
int err = 0, lasterr = 0;
int err = 0;
arc_buf_t *buf = NULL;
boolean_t pause = B_FALSE;
switch (resume_skip_check(td, dnp, zb)) {
case RESUME_SKIP_ALL:
@ -252,7 +241,9 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
if (BP_IS_HOLE(bp)) {
err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
return (err);
if (err != 0)
goto post;
return (0);
}
if (td->td_pfd && !td->td_pfd->pd_exited &&
@ -273,8 +264,6 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
td->td_arg);
if (err == TRAVERSE_VISIT_NO_CHILDREN)
return (0);
if (err == ERESTART)
pause = B_TRUE; /* handle pausing at a common point */
if (err != 0)
goto post;
}
@ -288,7 +277,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err != 0)
return (err);
goto post;
czb = kmem_alloc(sizeof (zbookmark_t), KM_PUSHPAGE);
@ -307,11 +296,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
zb->zb_blkid * epb + i);
err = traverse_visitbp(td, dnp,
&((blkptr_t *)buf->b_data)[i], czb);
if (err != 0) {
if (!TD_HARD(td))
if (err != 0)
break;
lasterr = err;
}
}
kmem_free(czb, sizeof (zbookmark_t));
@ -324,7 +310,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err != 0)
return (err);
goto post;
dnp = buf->b_data;
for (i = 0; i < epb; i++) {
@ -336,11 +322,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
for (i = 0; i < epb; i++) {
err = traverse_dnode(td, &dnp[i], zb->zb_objset,
zb->zb_blkid * epb + i);
if (err != 0) {
if (!TD_HARD(td))
if (err != 0)
break;
lasterr = err;
}
}
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
uint32_t flags = ARC_WAIT;
@ -350,7 +333,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err != 0)
return (err);
goto post;
osp = buf->b_data;
dnp = &osp->os_meta_dnode;
@ -365,19 +348,11 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
err = traverse_dnode(td, dnp, zb->zb_objset,
DMU_META_DNODE_OBJECT);
if (err && TD_HARD(td)) {
lasterr = err;
err = 0;
}
if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
dnp = &osp->os_groupused_dnode;
err = traverse_dnode(td, dnp, zb->zb_objset,
DMU_GROUPUSED_OBJECT);
}
if (err && TD_HARD(td)) {
lasterr = err;
err = 0;
}
if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
dnp = &osp->os_userused_dnode;
err = traverse_dnode(td, dnp, zb->zb_objset,
@ -389,19 +364,37 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
(void) arc_buf_remove_ref(buf, &buf);
post:
if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
if (err == 0 && (td->td_flags & TRAVERSE_POST))
err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
if (err == ERESTART)
pause = B_TRUE;
if ((td->td_flags & TRAVERSE_HARD) && (err == EIO || err == ECKSUM)) {
/*
* Ignore this disk error as requested by the HARD flag,
* and continue traversal.
*/
err = 0;
}
if (pause && td->td_resume != NULL) {
ASSERT3U(err, ==, ERESTART);
ASSERT(!TD_HARD(td));
traverse_pause(td, zb);
/*
* If we are stopping here, set td_resume.
*/
if (td->td_resume != NULL && err != 0 && !td->td_paused) {
td->td_resume->zb_objset = zb->zb_objset;
td->td_resume->zb_object = zb->zb_object;
td->td_resume->zb_level = 0;
/*
* If we have stopped on an indirect block (e.g. due to
* i/o error), we have not visited anything below it.
* Set the bookmark to the first level-0 block that we need
* to visit. This way, the resuming code does not need to
* deal with resuming from indirect blocks.
*/
td->td_resume->zb_blkid = zb->zb_blkid <<
(zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
td->td_paused = B_TRUE;
}
return (err != 0 ? err : lasterr);
return (err);
}
static void
@ -426,29 +419,21 @@ static int
traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
uint64_t objset, uint64_t object)
{
int j, err = 0, lasterr = 0;
int j, err = 0;
zbookmark_t czb;
for (j = 0; j < dnp->dn_nblkptr; j++) {
SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
if (err != 0) {
if (!TD_HARD(td))
if (err != 0)
break;
lasterr = err;
}
}
if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
if (err != 0) {
if (!TD_HARD(td))
}
return (err);
lasterr = err;
}
}
return (err != 0 ? err : lasterr);
}
/* ARGSUSED */
@ -539,6 +524,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
td->td_arg = arg;
td->td_pfd = pd;
td->td_flags = flags;
td->td_paused = B_FALSE;
pd->pd_blks_max = zfs_pd_blks_max;
pd->pd_flags = flags;
@ -617,7 +603,7 @@ int
traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
blkptr_cb_t func, void *arg)
{
int err, lasterr = 0;
int err;
uint64_t obj;
dsl_pool_t *dp = spa_get_dsl(spa);
objset_t *mos = dp->dp_meta_objset;
@ -630,16 +616,15 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
return (err);
/* visit each dataset */
for (obj = 1; err == 0 || (err != ESRCH && hard);
for (obj = 1; err == 0;
err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
dmu_object_info_t doi;
err = dmu_object_info(mos, obj, &doi);
if (err != 0) {
if (!hard)
return (err);
lasterr = err;
if (hard)
continue;
break;
}
if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
@ -650,25 +635,21 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
dsl_pool_config_exit(dp, FTAG);
if (err != 0) {
if (!hard)
return (err);
lasterr = err;
if (hard)
continue;
break;
}
if (ds->ds_phys->ds_prev_snap_txg > txg)
txg = ds->ds_phys->ds_prev_snap_txg;
err = traverse_dataset(ds, txg, flags, func, arg);
dsl_dataset_rele(ds, FTAG);
if (err != 0) {
if (!hard)
return (err);
lasterr = err;
}
if (err != 0)
break;
}
}
if (err == ESRCH)
err = 0;
return (err != 0 ? err : lasterr);
return (err);
}
#if defined(_KERNEL) && defined(HAVE_SPL)

View File

@ -245,6 +245,13 @@ dsl_pool_open(dsl_pool_t *dp)
dp->dp_meta_objset, obj));
}
/*
* Note: errors ignored, because the leak dir will not exist if we
* have not encountered a leak yet.
*/
(void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
&dp->dp_leak_dir);
if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
@ -292,6 +299,8 @@ dsl_pool_close(dsl_pool_t *dp)
dsl_dir_rele(dp->dp_mos_dir, dp);
if (dp->dp_free_dir)
dsl_dir_rele(dp->dp_free_dir, dp);
if (dp->dp_leak_dir)
dsl_dir_rele(dp->dp_leak_dir, dp);
if (dp->dp_root_dir)
dsl_dir_rele(dp->dp_root_dir, dp);

View File

@ -65,7 +65,7 @@ int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
int zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */
int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
@ -1417,7 +1417,7 @@ dsl_scan_active(dsl_scan_t *scn)
if (spa_shutting_down(spa))
return (B_FALSE);
if (scn->scn_phys.scn_state == DSS_SCANNING ||
scn->scn_async_destroying)
(scn->scn_async_destroying && !scn->scn_async_stalled))
return (B_TRUE);
if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
@ -1432,7 +1432,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
{
dsl_scan_t *scn = dp->dp_scan;
spa_t *spa = dp->dp_spa;
int err;
int err = 0;
/*
* Check for scn_restart_txg before checking spa_load_state, so
@ -1450,7 +1450,10 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
dsl_scan_setup_sync(&func, tx);
}
if (!dsl_scan_active(scn) ||
/*
* If the scan is inactive due to a stalled async destroy, try again.
*/
if ((!scn->scn_async_stalled && !dsl_scan_active(scn)) ||
spa_sync_pass(dp->dp_spa) > 1)
return;
@ -1460,10 +1463,11 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
spa->spa_scrub_active = B_TRUE;
/*
* First process the free list. If we pause the free, don't do
* any scanning. This ensures that there is no free list when
* we are scanning, so the scan code doesn't have to worry about
* traversing it.
* First process the async destroys. If we pause, don't do
* any scrubbing or resilvering. This ensures that there are no
* async destroys while we are scanning, so the scan code doesn't
* have to worry about traversing it. It is also faster to free the
* blocks than to scrub them.
*/
if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
scn->scn_is_bptree = B_FALSE;
@ -1473,21 +1477,36 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
dsl_scan_free_block_cb, scn, tx);
VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
if (err == 0 && spa_feature_is_active(spa,
SPA_FEATURE_ASYNC_DESTROY)) {
if (err != 0 && err != ERESTART)
zfs_panic_recover("error %u from bpobj_iterate()", err);
}
if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
ASSERT(scn->scn_async_destroying);
scn->scn_is_bptree = B_TRUE;
scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
NULL, ZIO_FLAG_MUSTSUCCEED);
err = bptree_iterate(dp->dp_meta_objset,
dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
scn, tx);
dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
VERIFY0(zio_wait(scn->scn_zio_root));
if (err == 0) {
if (err == EIO || err == ECKSUM) {
err = 0;
} else if (err != 0 && err != ERESTART) {
zfs_panic_recover("error %u from "
"traverse_dataset_destroyed()", err);
}
/*
* If we didn't make progress, mark the async destroy as
* stalled, so that we will not initiate a spa_sync() on
* its behalf.
*/
scn->scn_async_stalled = (scn->scn_visited_this_txg == 0);
if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
/* finished; deactivate async destroy feature */
spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY,
tx);
spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
ASSERT(!spa_feature_is_active(spa,
SPA_FEATURE_ASYNC_DESTROY));
VERIFY0(zap_remove(dp->dp_meta_objset,
@ -1501,20 +1520,49 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
}
if (scn->scn_visited_this_txg) {
zfs_dbgmsg("freed %llu blocks in %llums from "
"free_bpobj/bptree txg %llu",
"free_bpobj/bptree txg %llu; err=%u",
(longlong_t)scn->scn_visited_this_txg,
(longlong_t)
NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
(longlong_t)tx->tx_txg);
(longlong_t)tx->tx_txg, err);
scn->scn_visited_this_txg = 0;
/*
* Re-sync the ddt so that we can further modify
* it when doing bprewrite.
* Write out changes to the DDT that may be required as a
* result of the blocks freed. This ensures that the DDT
* is clean when a scrub/resilver runs.
*/
ddt_sync(spa, tx->tx_txg);
}
if (err == ERESTART)
if (err != 0)
return;
if (!scn->scn_async_destroying && zfs_free_leak_on_eio &&
(dp->dp_free_dir->dd_phys->dd_used_bytes != 0 ||
dp->dp_free_dir->dd_phys->dd_compressed_bytes != 0 ||
dp->dp_free_dir->dd_phys->dd_uncompressed_bytes != 0)) {
/*
* We have finished background destroying, but there is still
* some space left in the dp_free_dir. Transfer this leaked
* space to the dp_leak_dir.
*/
if (dp->dp_leak_dir == NULL) {
rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
LEAK_DIR_NAME, tx);
VERIFY0(dsl_pool_open_special_dir(dp,
LEAK_DIR_NAME, &dp->dp_leak_dir));
rrw_exit(&dp->dp_config_rwlock, FTAG);
}
dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
dp->dp_free_dir->dd_phys->dd_used_bytes,
dp->dp_free_dir->dd_phys->dd_compressed_bytes,
dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx);
dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
-dp->dp_free_dir->dd_phys->dd_used_bytes,
-dp->dp_free_dir->dd_phys->dd_compressed_bytes,
-dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx);
}
if (!scn->scn_async_destroying) {
/* finished; verify that space accounting went to zero */
ASSERT0(dp->dp_free_dir->dd_phys->dd_used_bytes);
ASSERT0(dp->dp_free_dir->dd_phys->dd_compressed_bytes);

View File

@ -238,19 +238,25 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
}
if (pool != NULL) {
dsl_dir_t *freedir = pool->dp_free_dir;
/*
* The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
* when opening pools before this version freedir will be NULL.
*/
if (freedir != NULL) {
if (pool->dp_free_dir != NULL) {
spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
freedir->dd_phys->dd_used_bytes, src);
pool->dp_free_dir->dd_phys->dd_used_bytes, src);
} else {
spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
NULL, 0, src);
}
if (pool->dp_leak_dir != NULL) {
spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
pool->dp_leak_dir->dd_phys->dd_used_bytes, src);
} else {
spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
NULL, 0, src);
}
}
spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

View File

@ -29,7 +29,7 @@
list_t zfs_dbgmsgs;
int zfs_dbgmsg_size;
kmutex_t zfs_dbgmsgs_lock;
int zfs_dbgmsg_maxsize = 1<<20; /* 1MB */
int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
#endif
/*
@ -44,7 +44,38 @@ int zfs_flags = 0;
* This should only be used as a last resort, as it typically results
* in leaked space, or worse.
*/
int zfs_recover = 0;
int zfs_recover = B_FALSE;
/*
* If destroy encounters an EIO while reading metadata (e.g. indirect
* blocks), space referenced by the missing metadata can not be freed.
* Normally this causes the background destroy to become "stalled", as
* it is unable to make forward progress. While in this stalled state,
* all remaining space to free from the error-encountering filesystem is
* "temporarily leaked". Set this flag to cause it to ignore the EIO,
* permanently leak the space from indirect blocks that can not be read,
* and continue to free everything else that it can.
*
* The default, "stalling" behavior is useful if the storage partially
* fails (i.e. some but not all i/os fail), and then later recovers. In
* this case, we will be able to continue pool operations while it is
* partially failed, and when it recovers, we can continue to free the
* space, with no leaks. However, note that this case is actually
* fairly rare.
*
* Typically pools either (a) fail completely (but perhaps temporarily,
* e.g. a top-level vdev going offline), or (b) have localized,
* permanent errors (e.g. disk returns the wrong data due to bit flip or
* firmware bug). In case (a), this setting does not matter because the
* pool will be suspended and the sync thread will not be able to make
* forward progress regardless. In case (b), because the error is
* permanent, the best we can do is leak the minimum amount of space,
* which is what setting this flag will do. Therefore, it is reasonable
* for this flag to normally be set, but we chose the more conservative
* approach of not setting it, so that there is no possibility of
* leaking space in the "partial temporary" failure case.
*/
int zfs_free_leak_on_eio = B_FALSE;
void
@ -163,4 +194,8 @@ MODULE_PARM_DESC(zfs_flags, "Set additional debugging flags");
module_param(zfs_recover, int, 0644);
MODULE_PARM_DESC(zfs_recover, "Set to attempt to recover from fatal errors");
module_param(zfs_free_leak_on_eio, int, 0644);
MODULE_PARM_DESC(zfs_free_leak_on_eio,
"Set to ignore IO errors during free and permanently leak the space");
#endif /* _KERNEL */

View File

@ -3356,13 +3356,6 @@ zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
ASSERT(zb1->zb_objset == zb2->zb_objset);
ASSERT(zb2->zb_level == 0);
/*
* A bookmark in the deadlist is considered to be after
* everything else.
*/
if (zb2->zb_object == DMU_DEADLIST_OBJECT)
return (B_TRUE);
/* The objset_phys_t isn't before anything. */
if (dnp == NULL)
return (B_FALSE);