1
0
mirror of https://git.FreeBSD.org/src.git synced 2024-12-17 10:26:15 +00:00

6393 zfs receive a full send as a clone

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Dan McDonald <danmcd@omniti.com>
Author: Paul Dagnelie <pcd@delphix.com>

illumos/illumos-gate@68ecb2ec93
This commit is contained in:
Alexander Motin 2016-01-26 13:09:16 +00:00
parent d1e5f965a6
commit cce747b2cb
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/vendor-sys/illumos/dist/; revision=294814
3 changed files with 117 additions and 56 deletions

View File

@ -137,6 +137,14 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
return (0);
}
/*
* Fill in the drr_free struct, or perform aggregation if the previous record is
* also a free record, and the two are adjacent.
*
* Note that we send free records even for a full send, because we want to be
* able to receive a full send as a clone, which requires a list of all the free
* and freeobject records that were generated on the source.
*/
static int
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
uint64_t length)
@ -160,15 +168,6 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
(object == dsp->dsa_last_data_object &&
offset > dsp->dsa_last_data_offset));
/*
* If we are doing a non-incremental send, then there can't
* be any data in the dataset we're receiving into. Therefore
* a free record would simply be a no-op. Save space by not
* sending it to begin with.
*/
if (!dsp->dsa_incremental)
return (0);
if (length != -1ULL && offset + length < offset)
length = -1ULL;
@ -347,10 +346,6 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
{
struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
/* See comment in dump_free(). */
if (!dsp->dsa_incremental)
return (0);
/*
* If there is a pending op, but it's not PENDING_FREEOBJECTS,
* push it out, since free block aggregation can only be done for
@ -750,6 +745,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;
if (ancestor_zb != NULL) {
drr->drr_u.drr_begin.drr_fromguid =
@ -772,7 +768,6 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
dsp->dsa_off = off;
dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
dsp->dsa_pending_op = PENDING_NONE;
dsp->dsa_incremental = (ancestor_zb != NULL);
dsp->dsa_featureflags = featureflags;
dsp->dsa_resume_object = resumeobj;
dsp->dsa_resume_offset = resumeoff;
@ -1286,7 +1281,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
/* target fs already exists; recv into temp clone */
/* Can't recv a clone into an existing fs */
if (flags & DRR_FLAG_CLONE) {
if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL));
}
@ -1305,6 +1300,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
drba->drba_origin))
return (SET_ERROR(ENOENT));
/*
* If we're receiving a full send as a clone, and it doesn't
* contain all the necessary free records and freeobject
* records, reject it.
*/
if (fromguid == 0 && drba->drba_origin &&
!(flags & DRR_FLAG_FREERECORDS))
return (SET_ERROR(EINVAL));
/* Open the parent of tofs */
ASSERT3U(strlen(tofs), <, MAXNAMELEN);
(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
@ -1344,7 +1348,8 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL));
}
if (dsl_dataset_phys(origin)->ds_guid != fromguid) {
if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
fromguid != 0) {
dsl_dataset_rele(origin, FTAG);
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(ENODEV));
@ -1674,6 +1679,20 @@ struct receive_writer_arg {
uint64_t bytes_read; /* bytes read when current record created */
};
/*
 * A list of object numbers, together with the bookkeeping needed to check
 * that callers look entries up in ascending order.
 */
struct objlist {
	/* Entries are of type struct receive_objnode. */
	list_t		list;
	/*
	 * Object number passed to the most recent lookup.  Used to assert
	 * that successive lookups are made in ascending order.
	 */
	uint64_t	last_lookup;
};
/* A single element of an objlist: one object number plus its list linkage. */
struct receive_objnode {
	list_node_t	node;	/* linkage used by the containing list_t */
	uint64_t	object;	/* object number held by this element */
};
struct receive_arg {
objset_t *os;
vnode_t *vp; /* The vnode to read the stream from */
@ -1691,12 +1710,7 @@ struct receive_arg {
int err;
boolean_t byteswap;
/* Sorted list of objects not to issue prefetches for. */
list_t ignore_obj_list;
};
struct receive_ign_obj_node {
list_node_t node;
uint64_t object;
struct objlist ignore_objlist;
};
typedef struct guid_map_entry {
@ -2008,13 +2022,14 @@ receive_freeobjects(struct receive_writer_arg *rwa,
struct drr_freeobjects *drrfo)
{
uint64_t obj;
int next_err = 0;
if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
return (SET_ERROR(EINVAL));
for (obj = drrfo->drr_firstobj;
obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
(void) dmu_object_next(rwa->os, &obj, FALSE, 0)) {
obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0;
next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) {
int err;
if (dmu_object_info(rwa->os, obj, NULL) != 0)
@ -2024,7 +2039,8 @@ receive_freeobjects(struct receive_writer_arg *rwa,
if (err != 0)
return (err);
}
if (next_err != ESRCH)
return (next_err);
return (0);
}
@ -2354,6 +2370,66 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
return (0);
}
/*
 * Prepare an objlist for use: reset the lookup watermark and set up the
 * embedded list to hold struct receive_objnode elements linked through
 * their 'node' member.
 */
static void
objlist_create(struct objlist *list)
{
	const size_t elem_size = sizeof (struct receive_objnode);
	const size_t link_offset = offsetof(struct receive_objnode, node);

	list->last_lookup = 0;
	list_create(&list->list, elem_size, link_offset);
}
/*
 * Tear down an objlist: unlink and free every element still on the list,
 * then destroy the list itself.
 */
static void
objlist_destroy(struct objlist *list)
{
	struct receive_objnode *victim;

	while ((victim = list_remove_head(&list->list)) != NULL) {
		kmem_free(victim, sizeof (*victim));
	}
	list_destroy(&list->list);
}
/*
 * Report whether 'object' is present in the objlist.
 *
 * The lookup is destructive: every element at the front of the list whose
 * object number is smaller than 'object' is removed and freed along the way.
 * As a consequence, asking about an object number smaller than one that was
 * previously looked up always yields false, so callers must perform their
 * lookups in ascending order (this is asserted via last_lookup).
 */
static boolean_t
objlist_exists(struct objlist *list, uint64_t object)
{
	struct receive_objnode *node;

	ASSERT3U(object, >=, list->last_lookup);
	list->last_lookup = object;

	/* Discard leading elements that precede the requested object. */
	for (node = list_head(&list->list); node != NULL;
	    node = list_head(&list->list)) {
		if (node->object >= object) {
			break;
		}
		VERIFY3P(node, ==, list_remove_head(&list->list));
		kmem_free(node, sizeof (*node));
	}

	/* Whatever is now at the head is the only possible match. */
	return (!(node == NULL || node->object != object));
}
/*
 * Add 'object' to the objlist.
 *
 * The objlist keeps its object numbers in ascending order, but insertion does
 * not search for the proper position; the new element is simply appended to
 * the tail.  Callers are therefore responsible for inserting object numbers
 * in ascending order, which debug builds (ZFS_DEBUG) assert.
 */
static void
objlist_insert(struct objlist *list, uint64_t object)
{
	struct receive_objnode *node = kmem_zalloc(sizeof (*node), KM_SLEEP);

	node->object = object;
#ifdef ZFS_DEBUG
	{
		/* The new number must exceed the current tail's (0 if empty). */
		struct receive_objnode *tail = list_tail(&list->list);
		uint64_t last_objnum = 0;

		if (tail != NULL) {
			last_objnum = tail->object;
		}
		ASSERT3U(node->object, >, last_objnum);
	}
#endif
	list_insert_tail(&list->list, node);
}
/*
* Issue the prefetch reads for any necessary indirect blocks.
*
@ -2376,13 +2452,7 @@ static void
receive_read_prefetch(struct receive_arg *ra,
uint64_t object, uint64_t offset, uint64_t length)
{
struct receive_ign_obj_node *node = list_head(&ra->ignore_obj_list);
while (node != NULL && node->object < object) {
VERIFY3P(node, ==, list_remove_head(&ra->ignore_obj_list));
kmem_free(node, sizeof (*node));
node = list_head(&ra->ignore_obj_list);
}
if (node == NULL || node->object > object) {
if (!objlist_exists(&ra->ignore_objlist, object)) {
dmu_prefetch(ra->os, object, 1, offset, length,
ZIO_PRIORITY_SYNC_READ);
}
@ -2415,18 +2485,7 @@ receive_read_record(struct receive_arg *ra)
*/
if (err == ENOENT ||
(err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
struct receive_ign_obj_node *node =
kmem_zalloc(sizeof (*node),
KM_SLEEP);
node->object = drro->drr_object;
#ifdef ZFS_DEBUG
struct receive_ign_obj_node *last_object =
list_tail(&ra->ignore_obj_list);
uint64_t last_objnum = (last_object != NULL ?
last_object->object : 0);
ASSERT3U(node->object, >, last_objnum);
#endif
list_insert_tail(&ra->ignore_obj_list, node);
objlist_insert(&ra->ignore_objlist, drro->drr_object);
err = 0;
}
return (err);
@ -2643,7 +2702,6 @@ resume_check(struct receive_arg *ra, nvlist_t *begin_nvl)
return (0);
}
/*
* Read in the stream's records, one by one, and apply them to the pool. There
* are two threads involved; the thread that calls this function will spin up a
@ -2677,8 +2735,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
sizeof (ra.bytes_read), 1, &ra.bytes_read);
}
list_create(&ra.ignore_obj_list, sizeof (struct receive_ign_obj_node),
offsetof(struct receive_ign_obj_node, node));
objlist_create(&ra.ignore_objlist);
/* these were verified in dmu_recv_begin */
ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
@ -2832,12 +2889,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
}
*voffp = ra.voff;
for (struct receive_ign_obj_node *n =
list_remove_head(&ra.ignore_obj_list); n != NULL;
n = list_remove_head(&ra.ignore_obj_list)) {
kmem_free(n, sizeof (*n));
}
list_destroy(&ra.ignore_obj_list);
objlist_destroy(&ra.ignore_objlist);
return (err);
}

View File

@ -24,7 +24,7 @@
*/
/*
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 2013, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_DMU_IMPL_H
@ -293,7 +293,6 @@ typedef struct dmu_sendarg {
uint64_t dsa_toguid;
int dsa_err;
dmu_pendop_t dsa_pending_op;
boolean_t dsa_incremental;
uint64_t dsa_featureflags;
uint64_t dsa_last_data_object;
uint64_t dsa_last_data_offset;

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZFS_IOCTL_H
@ -126,6 +126,16 @@ typedef enum dmu_send_resume_token_version {
#define DRR_FLAG_CLONE (1<<0)
#define DRR_FLAG_CI_DATA (1<<1)
/*
* This send stream, if it is a full send, includes the FREE and FREEOBJECT
* records that are created by the sending process. This means that the send
* stream can be received as a clone, even though it is not an incremental.
* This is not implemented as a feature flag, because the receiving side does
* not need to have implemented it to receive this stream; it is fully backwards
* compatible. We need a flag, though, because full send streams without it
* cannot necessarily be received as a clone correctly.
*/
#define DRR_FLAG_FREERECORDS (1<<2)
/*
* flags in the drr_checksumflags field in the DRR_WRITE and