1
0
mirror of https://git.FreeBSD.org/src.git synced 2024-12-16 10:20:30 +00:00

MFV r294814: 6393 zfs receive a full send as a clone

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Dan McDonald <danmcd@omniti.com>
Author: Paul Dagnelie <pcd@delphix.com>

illumos/illumos-gate@68ecb2ec93

This allows doing a full (non-incremental) send and receiving it as a clone
of an existing dataset. It can leverage nopwrite to share blocks with the
origin. This can be used to change the relationship of datasets on the
target. For example, maybe on the source you have:

A ---- B ---- C

And you have sent to the target a full of B, and the incremental B->C:

B ---- C

You later realize that you want to have A on the target. You will have to
do a full send of A, but nopwrite can save you space on the target if you
receive it as a clone of B, assuming that A and B have some blocks in
common:

B ---- C
 \
  A
This commit is contained in:
Alexander Motin 2016-01-26 13:14:39 +00:00
commit 75b810aee6
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=294815
4 changed files with 122 additions and 58 deletions

View File

@ -2841,8 +2841,11 @@ Do not actually receive the stream. This can be useful in conjunction with the
option to verify the name the receive operation would use.
.It Fl o Sy origin Ns = Ns Ar snapshot
Forces the stream to be received as a clone of the given snapshot.
This is only valid if the stream is an incremental stream whose source
is the same as the provided origin.
If the stream is a full send stream, this will create the filesystem
described by the stream as a clone of the specified snapshot. Which
snapshot was specified will not affect the success or failure of the
receive, as long as the snapshot does exist. If the stream is an
incremental send stream, all the normal verification will be performed.
.It Fl F
Force a rollback of the file system to the most recent snapshot before
performing the receive operation. If receiving an incremental replication

View File

@ -158,6 +158,14 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
return (0);
}
/*
* Fill in the drr_free struct, or perform aggregation if the previous record is
* also a free record, and the two are adjacent.
*
* Note that we send free records even for a full send, because we want to be
* able to receive a full send as a clone, which requires a list of all the free
* and freeobject records that were generated on the source.
*/
static int
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
uint64_t length)
@ -181,15 +189,6 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
(object == dsp->dsa_last_data_object &&
offset > dsp->dsa_last_data_offset));
/*
* If we are doing a non-incremental send, then there can't
* be any data in the dataset we're receiving into. Therefore
* a free record would simply be a no-op. Save space by not
* sending it to begin with.
*/
if (!dsp->dsa_incremental)
return (0);
if (length != -1ULL && offset + length < offset)
length = -1ULL;
@ -368,10 +367,6 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
{
struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
/* See comment in dump_free(). */
if (!dsp->dsa_incremental)
return (0);
/*
* If there is a pending op, but it's not PENDING_FREEOBJECTS,
* push it out, since free block aggregation can only be done for
@ -776,6 +771,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;
if (ancestor_zb != NULL) {
drr->drr_u.drr_begin.drr_fromguid =
@ -799,7 +795,6 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
dsp->dsa_off = off;
dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
dsp->dsa_pending_op = PENDING_NONE;
dsp->dsa_incremental = (ancestor_zb != NULL);
dsp->dsa_featureflags = featureflags;
dsp->dsa_resume_object = resumeobj;
dsp->dsa_resume_offset = resumeoff;
@ -1321,7 +1316,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
/* target fs already exists; recv into temp clone */
/* Can't recv a clone into an existing fs */
if (flags & DRR_FLAG_CLONE) {
if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL));
}
@ -1340,6 +1335,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
drba->drba_origin))
return (SET_ERROR(ENOENT));
/*
* If we're receiving a full send as a clone, and it doesn't
* contain all the necessary free records and freeobject
* records, reject it.
*/
if (fromguid == 0 && drba->drba_origin &&
!(flags & DRR_FLAG_FREERECORDS))
return (SET_ERROR(EINVAL));
/* Open the parent of tofs */
ASSERT3U(strlen(tofs), <, MAXNAMELEN);
(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
@ -1379,7 +1383,8 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL));
}
if (dsl_dataset_phys(origin)->ds_guid != fromguid) {
if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
fromguid != 0) {
dsl_dataset_rele(origin, FTAG);
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(ENODEV));
@ -1709,6 +1714,20 @@ struct receive_writer_arg {
uint64_t bytes_read; /* bytes read when current record created */
};
/*
 * A list of object numbers, together with the bookkeeping needed to check
 * that lookups against it are performed in ascending object-number order.
 */
struct objlist {
list_t list; /* List of struct receive_objnode. */
/*
* Last object looked up. Used to assert that objects are being looked
* up in ascending order.
*/
uint64_t last_lookup;
};
/* One entry of a struct objlist: a single object number. */
struct receive_objnode {
list_node_t node; /* Linkage into objlist's list. */
uint64_t object; /* Object number recorded by this entry. */
};
struct receive_arg {
objset_t *os;
kthread_t *td;
@ -1727,12 +1746,7 @@ struct receive_arg {
int err;
boolean_t byteswap;
/* Sorted list of objects not to issue prefetches for. */
list_t ignore_obj_list;
};
struct receive_ign_obj_node {
list_node_t node;
uint64_t object;
struct objlist ignore_objlist;
};
typedef struct guid_map_entry {
@ -2068,13 +2082,14 @@ receive_freeobjects(struct receive_writer_arg *rwa,
struct drr_freeobjects *drrfo)
{
uint64_t obj;
int next_err = 0;
if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
return (SET_ERROR(EINVAL));
for (obj = drrfo->drr_firstobj;
obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
(void) dmu_object_next(rwa->os, &obj, FALSE, 0)) {
obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0;
next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) {
int err;
if (dmu_object_info(rwa->os, obj, NULL) != 0)
@ -2084,7 +2099,8 @@ receive_freeobjects(struct receive_writer_arg *rwa,
if (err != 0)
return (err);
}
if (next_err != ESRCH)
return (next_err);
return (0);
}
@ -2414,6 +2430,66 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
return (0);
}
/*
 * Initialize an empty objlist: no lookups have happened yet, and the
 * underlying list is set up to link struct receive_objnode entries through
 * their 'node' member.
 */
static void
objlist_create(struct objlist *list)
{
	list->last_lookup = 0;
	list_create(&list->list, sizeof (struct receive_objnode),
	    offsetof(struct receive_objnode, node));
}
/*
 * Tear down an objlist: free every entry still linked on it, then destroy
 * the underlying list.
 */
static void
objlist_destroy(struct objlist *list)
{
	struct receive_objnode *entry;

	/* Drain from the head until the list reports it is empty. */
	while ((entry = list_remove_head(&list->list)) != NULL) {
		kmem_free(entry, sizeof (*entry));
	}
	list_destroy(&list->list);
}
/*
 * Report whether the given object number is present in the objlist.
 *
 * As a side effect, every entry whose object number is smaller than the one
 * being looked up is removed from the list and freed.  Consequently, a lookup
 * of an object number smaller than one that was looked up earlier always
 * returns false, so callers must perform their lookups in ascending order.
 */
static boolean_t
objlist_exists(struct objlist *list, uint64_t object)
{
	struct receive_objnode *cur;

	ASSERT3U(object, >=, list->last_lookup);
	list->last_lookup = object;

	/* Discard leading entries that sort before the requested object. */
	for (cur = list_head(&list->list);
	    cur != NULL && cur->object < object;
	    cur = list_head(&list->list)) {
		VERIFY3P(cur, ==, list_remove_head(&list->list));
		kmem_free(cur, sizeof (*cur));
	}

	return (cur != NULL && cur->object == object);
}
/*
 * Add an object number to the objlist.
 *
 * The objlist keeps its object numbers in ascending order, but insertion does
 * not search for the right position: the new entry is simply appended to the
 * tail.  Callers are therefore responsible for inserting object numbers in
 * ascending order; debug builds assert this.
 */
static void
objlist_insert(struct objlist *list, uint64_t object)
{
	struct receive_objnode *entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);

	entry->object = object;
#ifdef ZFS_DEBUG
	{
		/* The new number must sort after the current tail (if any). */
		struct receive_objnode *tail = list_tail(&list->list);
		uint64_t prev_objnum = (tail == NULL) ? 0 : tail->object;

		ASSERT3U(entry->object, >, prev_objnum);
	}
#endif
	list_insert_tail(&list->list, entry);
}
/*
* Issue the prefetch reads for any necessary indirect blocks.
*
@ -2436,13 +2512,7 @@ static void
receive_read_prefetch(struct receive_arg *ra,
uint64_t object, uint64_t offset, uint64_t length)
{
struct receive_ign_obj_node *node = list_head(&ra->ignore_obj_list);
while (node != NULL && node->object < object) {
VERIFY3P(node, ==, list_remove_head(&ra->ignore_obj_list));
kmem_free(node, sizeof (*node));
node = list_head(&ra->ignore_obj_list);
}
if (node == NULL || node->object > object) {
if (!objlist_exists(&ra->ignore_objlist, object)) {
dmu_prefetch(ra->os, object, 1, offset, length,
ZIO_PRIORITY_SYNC_READ);
}
@ -2475,18 +2545,7 @@ receive_read_record(struct receive_arg *ra)
*/
if (err == ENOENT ||
(err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
struct receive_ign_obj_node *node =
kmem_zalloc(sizeof (*node),
KM_SLEEP);
node->object = drro->drr_object;
#ifdef ZFS_DEBUG
struct receive_ign_obj_node *last_object =
list_tail(&ra->ignore_obj_list);
uint64_t last_objnum = (last_object != NULL ?
last_object->object : 0);
ASSERT3U(node->object, >, last_objnum);
#endif
list_insert_tail(&ra->ignore_obj_list, node);
objlist_insert(&ra->ignore_objlist, drro->drr_object);
err = 0;
}
return (err);
@ -2704,7 +2763,6 @@ resume_check(struct receive_arg *ra, nvlist_t *begin_nvl)
return (0);
}
/*
* Read in the stream's records, one by one, and apply them to the pool. There
* are two threads involved; the thread that calls this function will spin up a
@ -2739,8 +2797,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
sizeof (ra.bytes_read), 1, &ra.bytes_read);
}
list_create(&ra.ignore_obj_list, sizeof (struct receive_ign_obj_node),
offsetof(struct receive_ign_obj_node, node));
objlist_create(&ra.ignore_objlist);
/* these were verified in dmu_recv_begin */
ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
@ -2894,12 +2951,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
}
*voffp = ra.voff;
for (struct receive_ign_obj_node *n =
list_remove_head(&ra.ignore_obj_list); n != NULL;
n = list_remove_head(&ra.ignore_obj_list)) {
kmem_free(n, sizeof (*n));
}
list_destroy(&ra.ignore_obj_list);
objlist_destroy(&ra.ignore_objlist);
return (err);
}

View File

@ -25,7 +25,7 @@
/*
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Copyright (c) 2013, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_DMU_IMPL_H
@ -296,7 +296,6 @@ typedef struct dmu_sendarg {
uint64_t dsa_toguid;
int dsa_err;
dmu_pendop_t dsa_pending_op;
boolean_t dsa_incremental;
uint64_t dsa_featureflags;
uint64_t dsa_last_data_object;
uint64_t dsa_last_data_offset;

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZFS_IOCTL_H
@ -126,6 +126,16 @@ typedef enum dmu_send_resume_token_version {
#define DRR_FLAG_CLONE (1<<0)
#define DRR_FLAG_CI_DATA (1<<1)
/*
* This send stream, if it is a full send, includes the FREE and FREEOBJECT
* records that are created by the sending process. This means that the send
* stream can be received as a clone, even though it is not an incremental.
* This is not implemented as a feature flag, because the receiving side does
* not need to have implemented it to receive this stream; it is fully backwards
* compatible. We need a flag, though, because full send streams without it
* cannot necessarily be received as a clone correctly.
*/
#define DRR_FLAG_FREERECORDS (1<<2)
/*
* flags in the drr_checksumflags field in the DRR_WRITE and