Make ZFS filesystem id persistent across different machines
[zfs.git] / module / zfs / dmu_send.c
index 6b00b73..9f90037 100644
@@ -20,6 +20,8 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  */
 
 #include <sys/dmu.h>
 #include <zfs_fletcher.h>
 #include <sys/avl.h>
 #include <sys/ddt.h>
+#include <sys/zfs_onexit.h>
+
+/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
+int zfs_send_corrupt_data = B_FALSE;
 
 static char *dmu_recv_tag = "dmu_recv_tag";
 
@@ -88,6 +95,9 @@ dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
 {
        struct drr_free *drrf = &(ba->drr->drr_u.drr_free);
 
+       if (length != -1ULL && offset + length < offset)
+               length = -1ULL;
+
        /*
         * If there is a pending op, but it's not PENDING_FREE, push it out,
         * since free block aggregation can only be done for blocks of the
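
The new guard at the top of dump_free() clamps an (offset, length) pair whose sum would wrap past UINT64_MAX to the -1ULL "free to end of object" sentinel, so an overflowed request cannot alias a smaller range. A minimal standalone sketch of the same unsigned-wraparound test (the names here are illustrative, not from the ZFS tree):

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Clamp an (offset, length) pair the way the patched dump_free() does:
     * if offset + length wraps past UINT64_MAX, treat the request as
     * "free from offset to the end of the object" (the -1ULL sentinel).
     */
    static uint64_t
    clamp_free_length(uint64_t offset, uint64_t length)
    {
        if (length != (uint64_t)-1 && offset + length < offset)
            length = (uint64_t)-1;  /* wrapped: fall back to the sentinel */
        return (length);
    }

    int
    main(void)
    {
        /* A free that wraps around the offset space is clamped... */
        printf("%llx\n", (unsigned long long)
            clamp_free_length(UINT64_MAX - 4096, 16384));  /* ffff... */
        /* ...while an ordinary free is left alone. */
        printf("%llx\n", (unsigned long long)
            clamp_free_length(0, 16384));                  /* 4000 */
        return (0);
    }
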
@@ -364,8 +374,20 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
 
                if (dsl_read(NULL, spa, bp, pbuf,
                    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
-                   ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
-                       return (EIO);
+                   ZIO_FLAG_CANFAIL, &aflags, zb) != 0) {
+                       if (zfs_send_corrupt_data) {
+                               uint64_t *ptr;
+                               /* Send a block filled with 0x"zfs badd bloc" */
+                               abuf = arc_buf_alloc(spa, blksz, &abuf,
+                                   ARC_BUFC_DATA);
+                               for (ptr = abuf->b_data;
+                                   (char *)ptr < (char *)abuf->b_data + blksz;
+                                   ptr++)
+                                       *ptr = 0x2f5baddb10c;
+                       } else {
+                               return (EIO);
+                       }
+               }
 
                err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz,
                    blksz, bp, abuf->b_data);
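
With the zfs_send_corrupt_data tunable set, a block that fails to read back is replaced in the stream by a buffer tiled with the 64-bit marker 0x2f5baddb10c, whose hex digits read as "zfs badd bloc", so corrupted regions are easy to spot on the receiving side; the send then continues instead of aborting with EIO. The word-stepping fill, reduced to a hedged userspace sketch:

    #include <stdint.h>
    #include <stdlib.h>
    #include <assert.h>

    #define ZFS_BADD_BLOC 0x2f5baddb10cULL  /* hex-speak: "zfs badd bloc" */

    /* Tile a block with the marker, one 64-bit word at a time. */
    static void
    fill_bad_block(void *buf, size_t blksz)
    {
        uint64_t *ptr;

        for (ptr = buf; (char *)ptr < (char *)buf + blksz; ptr++)
            *ptr = ZFS_BADD_BLOC;
    }

    int
    main(void)
    {
        size_t blksz = 128 * 1024;  /* a typical recordsize */
        uint64_t *buf = malloc(blksz);

        assert(buf != NULL);
        fill_bad_block(buf, blksz);
        assert(buf[0] == ZFS_BADD_BLOC);
        assert(buf[blksz / 8 - 1] == ZFS_BADD_BLOC);
        free(buf);
        return (0);
    }
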
@@ -494,6 +516,85 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
        return (0);
 }
 
+int
+dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
+    uint64_t *sizep)
+{
+       dsl_dataset_t *ds = tosnap->os_dsl_dataset;
+       dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
+       dsl_pool_t *dp = ds->ds_dir->dd_pool;
+       int err;
+       uint64_t size, recordsize;
+
+       /* tosnap must be a snapshot */
+       if (ds->ds_phys->ds_next_snap_obj == 0)
+               return (EINVAL);
+
+       /* fromsnap must be an earlier snapshot from the same fs as tosnap */
+       if (fromds && (ds->ds_dir != fromds->ds_dir ||
+           fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg))
+               return (EXDEV);
+
+       if (fromorigin) {
+               if (fromsnap)
+                       return (EINVAL);
+
+               if (dsl_dir_is_clone(ds->ds_dir)) {
+                       rw_enter(&dp->dp_config_rwlock, RW_READER);
+                       err = dsl_dataset_hold_obj(dp,
+                           ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds);
+                       rw_exit(&dp->dp_config_rwlock);
+                       if (err)
+                               return (err);
+               } else {
+                       fromorigin = B_FALSE;
+               }
+       }
+
+       /* Get uncompressed size estimate of changed data. */
+       if (fromds == NULL) {
+               size = ds->ds_phys->ds_uncompressed_bytes;
+       } else {
+               uint64_t used, comp;
+               err = dsl_dataset_space_written(fromds, ds,
+                   &used, &comp, &size);
+               if (fromorigin)
+                       dsl_dataset_rele(fromds, FTAG);
+               if (err)
+                       return (err);
+       }
+
+       /*
+        * Assume that space (both on-disk and in-stream) is dominated by
+        * data.  We will adjust for indirect blocks and the copies property,
+        * but ignore per-object space used (e.g., dnodes and DRR_OBJECT records).
+        */
+
+       /*
+        * Subtract out approximate space used by indirect blocks.
+        * Assume most space is used by data blocks (non-indirect, non-dnode).
+        * Assume all blocks are recordsize.  Assume ditto blocks and
+        * internal fragmentation cancel out compression.
+        *
+        * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
+        * block, which we observe in practice.
+        */
+       rw_enter(&dp->dp_config_rwlock, RW_READER);
+       err = dsl_prop_get_ds(ds, "recordsize",
+           sizeof (recordsize), 1, &recordsize, NULL);
+       rw_exit(&dp->dp_config_rwlock);
+       if (err)
+               return (err);
+       size -= size / recordsize * sizeof (blkptr_t);
+
+       /* Add in the space for the record associated with each block. */
+       size += size / recordsize * sizeof (dmu_replay_record_t);
+
+       *sizep = size;
+
+       return (0);
+}
+
 struct recvbeginsyncarg {
        const char *tofs;
        const char *tosnap;
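
dmu_send_estimate() starts from the uncompressed size of the changed data, subtracts roughly one blkptr_t per recordsize block of indirect-block overhead, and adds one dmu_replay_record_t of stream framing per block. A back-of-the-envelope check of those two adjustments (the sizeof values below are assumptions for illustration; take the real ones from sys/spa.h and sys/zfs_ioctl.h in your tree):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* Assumed sizes -- verify against your tree's headers. */
        const uint64_t sizeof_blkptr = 128;  /* sizeof (blkptr_t) */
        const uint64_t sizeof_drr = 312;     /* sizeof (dmu_replay_record_t) */

        uint64_t size = 1ULL << 30;          /* 1 GiB of changed data */
        uint64_t recordsize = 128 * 1024;    /* 128 KiB records */

        /* The same two adjustments dmu_send_estimate() makes. */
        size -= size / recordsize * sizeof_blkptr;  /* - 1 MiB of indirects */
        size += size / recordsize * sizeof_drr;     /* + ~2.4 MiB of framing */

        printf("estimated stream size: %llu bytes\n",
            (unsigned long long)size);
        return (0);
    }
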
@@ -573,6 +674,14 @@ recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
        if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
                return (ETXTBSY);
 
+       /* new snapshot name must not exist */
+       err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
+           ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
+       if (err == 0)
+               return (EEXIST);
+       if (err != ENOENT)
+               return (err);
+
        if (rbsa->fromguid) {
                /* if incremental, most recent snapshot must match fromguid */
                if (ds->ds_prev == NULL)
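
The relocated "new snapshot name must not exist" probe uses the usual ZAP error mapping: a lookup that succeeds is the failure case (the name is taken), ENOENT is the success case, and any other errno propagates. The same idiom as a self-contained sketch, with a toy table standing in for zap_lookup():

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    /* Toy stand-in for zap_lookup(): 0 if the name exists, ENOENT if not. */
    static int
    name_lookup(const char *name)
    {
        const char *taken[] = { "snap1", "snap2" };

        for (size_t i = 0; i < sizeof (taken) / sizeof (taken[0]); i++)
            if (strcmp(name, taken[i]) == 0)
                return (0);       /* found */
        return (ENOENT);
    }

    /* Succeed only when the probe proves the name is free. */
    static int
    check_name_unused(const char *name)
    {
        int err = name_lookup(name);

        if (err == 0)
            return (EEXIST);      /* name already taken */
        if (err != ENOENT)
            return (err);         /* propagate real failures */
        return (0);               /* ENOENT: free to create */
    }

    int
    main(void)
    {
        printf("%d %d\n", check_name_unused("snap1"),  /* EEXIST */
            check_name_unused("snap9"));               /* 0 */
        return (0);
    }
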
@@ -620,13 +729,6 @@ recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
        if (err != ENOENT)
                return (err);
 
-       /* new snapshot name must not exist */
-       err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
-           ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
-       if (err == 0)
-               return (EEXIST);
-       if (err != ENOENT)
-               return (err);
        return (0);
 }
 
@@ -661,7 +763,6 @@ recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx)
            dp->dp_spa, tx, "dataset = %lld", dsobj);
 }
 
-
 static boolean_t
 dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb)
 {
@@ -786,7 +887,7 @@ dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb,
                        return (err);
 
                if (dmu_recv_verify_features(ds, drrb)) {
-                       dsl_dataset_rele(ds, dmu_recv_tag);
+                       dsl_dataset_rele(ds, FTAG);
                        return (ENOTSUP);
                }
 
@@ -810,7 +911,7 @@ struct restorearg {
        uint64_t voff;
        int bufsize; /* amount of memory allocated for buf */
        zio_cksum_t cksum;
-       avl_tree_t guid_to_ds_map;
+       avl_tree_t *guid_to_ds_map;
 };
 
 typedef struct guid_map_entry {
@@ -832,59 +933,19 @@ guid_compare(const void *arg1, const void *arg2)
        return (0);
 }
 
-/*
- * This function is a callback used by dmu_objset_find() (which
- * enumerates the object sets) to build an avl tree that maps guids
- * to datasets.  The resulting table is used when processing DRR_WRITE_BYREF
- * send stream records.  These records, which are used in dedup'ed
- * streams, do not contain data themselves, but refer to a copy
- * of the data block that has already been written because it was
- * earlier in the stream.  That previous copy is identified by the
- * guid of the dataset with the referenced data.
- */
-int
-find_ds_by_guid(const char *name, void *arg)
+static void
+free_guid_map_onexit(void *arg)
 {
-       avl_tree_t *guid_map = arg;
-       dsl_dataset_t *ds, *snapds;
+       avl_tree_t *ca = arg;
+       void *cookie = NULL;
        guid_map_entry_t *gmep;
-       dsl_pool_t *dp;
-       int err;
-       uint64_t lastobj, firstobj;
 
-       if (dsl_dataset_hold(name, FTAG, &ds) != 0)
-               return (0);
-
-       dp = ds->ds_dir->dd_pool;
-       rw_enter(&dp->dp_config_rwlock, RW_READER);
-       firstobj = ds->ds_dir->dd_phys->dd_origin_obj;
-       lastobj = ds->ds_phys->ds_prev_snap_obj;
-
-       while (lastobj != firstobj) {
-               err = dsl_dataset_hold_obj(dp, lastobj, guid_map, &snapds);
-               if (err) {
-                       /*
-                        * Skip this snapshot and move on. It's not
-                        * clear why this would ever happen, but the
-                        * remainder of the snapshot stream can be
-                        * processed.
-                        */
-                       rw_exit(&dp->dp_config_rwlock);
-                       dsl_dataset_rele(ds, FTAG);
-                       return (0);
-               }
-
-               gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP);
-               gmep->guid = snapds->ds_phys->ds_guid;
-               gmep->gme_ds = snapds;
-               avl_add(guid_map, gmep);
-               lastobj = snapds->ds_phys->ds_prev_snap_obj;
+       while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
+               dsl_dataset_rele(gmep->gme_ds, ca);
+               kmem_free(gmep, sizeof (guid_map_entry_t));
        }
-
-       rw_exit(&dp->dp_config_rwlock);
-       dsl_dataset_rele(ds, FTAG);
-
-       return (0);
+       avl_destroy(ca);
+       kmem_free(ca, sizeof (avl_tree_t));
 }
 
 static void *
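
free_guid_map_onexit() empties the tree with libavl's cookie walk: avl_destroy_nodes() hands back one node per call without rebalancing, which is the O(n) way to tear down a tree you are about to destroy, and the tree pointer itself doubles as the dsl_dataset_rele() tag, matching the hold taken in add_ds_to_guidmap() later in this patch. A userspace rendering of the walk (assumes the illumos/ZFS libavl, i.e. <sys/avl.h> and -lavl; the entry type is trimmed from this patch's guid_map_entry_t):

    #include <sys/avl.h>   /* illumos/ZFS libavl */
    #include <stddef.h>
    #include <stdint.h>
    #include <stdlib.h>

    typedef struct entry {
        uint64_t    guid;
        avl_node_t  avlnode;
    } entry_t;

    static int
    entry_compare(const void *a, const void *b)
    {
        const entry_t *ea = a, *eb = b;

        if (ea->guid < eb->guid)
            return (-1);
        return (ea->guid > eb->guid);
    }

    /*
     * avl_destroy_nodes() yields each node exactly once, in no
     * particular order, without rebalancing -- cheaper than calling
     * avl_remove() per node when the tree is going away anyway.
     */
    static void
    destroy_tree(avl_tree_t *tree)
    {
        void *cookie = NULL;
        entry_t *e;

        while ((e = avl_destroy_nodes(tree, &cookie)) != NULL)
            free(e);          /* release per-node resources here */
        avl_destroy(tree);    /* tree must be empty by now */
    }

    int
    main(void)
    {
        avl_tree_t tree;

        avl_create(&tree, entry_compare, sizeof (entry_t),
            offsetof(entry_t, avlnode));
        for (uint64_t g = 1; g <= 4; g++) {
            entry_t *e = malloc(sizeof (*e));
            e->guid = g;
            avl_add(&tree, e);
        }
        destroy_tree(&tree);
        return (0);
    }
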
@@ -921,7 +982,7 @@ restore_read(struct restorearg *ra, int len)
        return (rv);
 }
 
-static void
+noinline static void
 backup_byteswap(dmu_replay_record_t *drr)
 {
 #define        DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
@@ -996,12 +1057,14 @@ backup_byteswap(dmu_replay_record_t *drr)
                DO64(drr_end.drr_checksum.zc_word[3]);
                DO64(drr_end.drr_toguid);
                break;
+       default:
+               break;
        }
 #undef DO64
 #undef DO32
 }
 
-static int
+noinline static int
 restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
 {
        int err;
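
backup_byteswap() handles streams generated on an opposite-endian machine: each multi-byte field of the replay record is swapped in place through the DO32/DO64 macros, and the added default: arm simply covers record types with no swappable fields (and quiets compiler switch warnings). A freestanding sketch of the same in-place field swap, with BSWAP_64 spelled out instead of taken from sys/byteorder.h:

    #include <stdint.h>
    #include <stdio.h>

    /* Equivalent of the kernel's BSWAP_64(). */
    static uint64_t
    bswap_64(uint64_t x)
    {
        x = (x >> 32) | (x << 32);
        x = ((x & 0xffff0000ffff0000ULL) >> 16) |
            ((x & 0x0000ffff0000ffffULL) << 16);
        return (((x & 0xff00ff00ff00ff00ULL) >> 8) |
            ((x & 0x00ff00ff00ff00ffULL) << 8));
    }

    /* Toy record: swap every field in place, as backup_byteswap() does. */
    struct toy_record {
        uint64_t offset;
        uint64_t length;
    };

    static void
    toy_byteswap(struct toy_record *r)
    {
    #define DO64(f) ((r)->f = bswap_64((r)->f))
        DO64(offset);
        DO64(length);
    #undef DO64
    }

    int
    main(void)
    {
        struct toy_record r = { 0x0102030405060708ULL, 0 };

        toy_byteswap(&r);
        printf("%016llx\n", (unsigned long long)r.offset);
        /* prints 0807060504030201 */
        return (0);
    }
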
@@ -1085,7 +1148,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
 }
 
 /* ARGSUSED */
-static int
+noinline static int
 restore_freeobjects(struct restorearg *ra, objset_t *os,
     struct drr_freeobjects *drrfo)
 {
@@ -1109,7 +1172,7 @@ restore_freeobjects(struct restorearg *ra, objset_t *os,
        return (0);
 }
 
-static int
+noinline static int
 restore_write(struct restorearg *ra, objset_t *os,
     struct drr_write *drrw)
 {
@@ -1173,7 +1236,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
         */
        if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
                gmesrch.guid = drrwbr->drr_refguid;
-               if ((gmep = avl_find(&ra->guid_to_ds_map, &gmesrch,
+               if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
                    &where)) == NULL) {
                        return (EINVAL);
                }
@@ -1183,8 +1246,9 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
                ref_os = os;
        }
 
-       if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
-           drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH))
+       err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
+           drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH);
+       if (err)
                return (err);
 
        tx = dmu_tx_create(os);
@@ -1254,7 +1318,7 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
 }
 
 /* ARGSUSED */
-static int
+noinline static int
 restore_free(struct restorearg *ra, objset_t *os,
     struct drr_free *drrf)
 {
@@ -1276,13 +1340,13 @@ restore_free(struct restorearg *ra, objset_t *os,
  * NB: callers *must* call dmu_recv_end() if this succeeds.
  */
 int
-dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp)
+dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
+    int cleanup_fd, uint64_t *action_handlep)
 {
        struct restorearg ra = { 0 };
        dmu_replay_record_t *drr;
        objset_t *os;
        zio_cksum_t pcksum;
-       guid_map_entry_t *gmep;
        int featureflags;
 
        if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
@@ -1318,7 +1382,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp)
        ra.vp = vp;
        ra.voff = *voffp;
        ra.bufsize = 1<<20;
-       ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
+       ra.buf = vmem_alloc(ra.bufsize, KM_SLEEP);
 
        /* these were verified in dmu_recv_begin */
        ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) ==
@@ -1336,12 +1400,37 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp)
 
        /* if this stream is dedup'ed, set up the avl tree for guid mapping */
        if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
-               avl_create(&ra.guid_to_ds_map, guid_compare,
-                   sizeof (guid_map_entry_t),
-                   offsetof(guid_map_entry_t, avlnode));
-               (void) dmu_objset_find(drc->drc_top_ds, find_ds_by_guid,
-                   (void *)&ra.guid_to_ds_map,
-                   DS_FIND_CHILDREN);
+               minor_t minor;
+
+               if (cleanup_fd == -1) {
+                       ra.err = EBADF;
+                       goto out;
+               }
+               ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor);
+               if (ra.err) {
+                       cleanup_fd = -1;
+                       goto out;
+               }
+
+               if (*action_handlep == 0) {
+                       ra.guid_to_ds_map =
+                           kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
+                       avl_create(ra.guid_to_ds_map, guid_compare,
+                           sizeof (guid_map_entry_t),
+                           offsetof(guid_map_entry_t, avlnode));
+                       ra.err = zfs_onexit_add_cb(minor,
+                           free_guid_map_onexit, ra.guid_to_ds_map,
+                           action_handlep);
+                       if (ra.err)
+                               goto out;
+               } else {
+                       ra.err = zfs_onexit_cb_data(minor, *action_handlep,
+                           (void **)&ra.guid_to_ds_map);
+                       if (ra.err)
+                               goto out;
+               }
+
+               drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
        }
 
        /*
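
This block changes the lifetime of the dedup guid-to-dataset map. Rather than rebuilding it with dmu_objset_find() on every receive (the deleted find_ds_by_guid() path), the first receive on a given cleanup_fd allocates the map and registers free_guid_map_onexit() through zfs_onexit, getting back an action handle; subsequent receives on the same fd pass the handle back in and share the cached map, which is torn down only when the fd closes. Roughly, the caller-side contract (a hedged sketch; only dmu_recv_stream()'s new signature comes from this patch, and open_cleanup_fd(), next_stream(), and stream_t are hypothetical stand-ins for the zfs_ioctl.c plumbing):

    /*
     * Consumer-side sketch: several incremental receives driven over
     * one control fd, sharing one guid map.
     */
    static int
    receive_stream_package(void)
    {
        int cleanup_fd = open_cleanup_fd();  /* held open across receives */
        uint64_t action_handle = 0;          /* 0 => no guid map cached yet */
        stream_t *s;
        int err = 0;

        while ((s = next_stream()) != NULL) {
            dmu_recv_cookie_t drc;
            offset_t voff = s->start_offset;

            /* ... dmu_recv_begin() on this stream elided ... */

            /*
             * First stream: dmu_recv_stream() allocates the guid map,
             * registers free_guid_map_onexit() against cleanup_fd, and
             * fills in action_handle.  Later streams: the handle is
             * resolved back to the cached map, so DRR_WRITE_BYREF
             * records can reference snapshots received earlier.
             */
            err = dmu_recv_stream(&drc, s->vp, &voff,
                cleanup_fd, &action_handle);
            if (err != 0)
                break;

            /* ... dmu_recv_end() elided ... */
        }

        (void) close(cleanup_fd);  /* fires free_guid_map_onexit() */
        return (err);
    }
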
@@ -1423,6 +1512,9 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp)
        ASSERT(ra.err != 0);
 
 out:
+       if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
+               zfs_onexit_fd_rele(cleanup_fd);
+
        if (ra.err != 0) {
                /*
                 * destroy what we created, so we don't leave it in the
@@ -1438,17 +1530,7 @@ out:
                }
        }
 
-       if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
-               void *cookie = NULL;
-
-               while (gmep = avl_destroy_nodes(&ra.guid_to_ds_map, &cookie)) {
-                       dsl_dataset_rele(gmep->gme_ds, &ra.guid_to_ds_map);
-                       kmem_free(gmep, sizeof (guid_map_entry_t));
-               }
-               avl_destroy(&ra.guid_to_ds_map);
-       }
-
-       kmem_free(ra.buf, ra.bufsize);
+       vmem_free(ra.buf, ra.bufsize);
        *voffp = ra.voff;
        return (ra.err);
 }
@@ -1487,11 +1569,35 @@ recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 }
 
 static int
+add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds)
+{
+       dsl_pool_t *dp = ds->ds_dir->dd_pool;
+       uint64_t snapobj = ds->ds_phys->ds_prev_snap_obj;
+       dsl_dataset_t *snapds;
+       guid_map_entry_t *gmep;
+       int err;
+
+       ASSERT(guid_map != NULL);
+
+       rw_enter(&dp->dp_config_rwlock, RW_READER);
+       err = dsl_dataset_hold_obj(dp, snapobj, guid_map, &snapds);
+       if (err == 0) {
+               gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP);
+               gmep->guid = snapds->ds_phys->ds_guid;
+               gmep->gme_ds = snapds;
+               avl_add(guid_map, gmep);
+       }
+
+       rw_exit(&dp->dp_config_rwlock);
+       return (err);
+}
+
+static int
 dmu_recv_existing_end(dmu_recv_cookie_t *drc)
 {
        struct recvendsyncarg resa;
        dsl_dataset_t *ds = drc->drc_logical_ds;
-       int err;
+       int err, myerr;
 
        /*
         * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
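
Design note on add_ds_to_guidmap() above: the guid map is now populated incrementally at receive-end, one snapshot per completed receive, instead of being rebuilt up front by walking every dataset. That ordering is forced: the snapshot a later stream will reference by guid (the ds_prev_snap_obj of the received dataset) only exists once the receive that created it commits. The dsl_dataset_hold_obj() here is tagged with the map pointer, and free_guid_map_onexit() releases with that same tag, so each mapped snapshot stays pinned exactly as long as the map that hands out references to it.
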
@@ -1526,8 +1632,11 @@ dmu_recv_existing_end(dmu_recv_cookie_t *drc)
 
 out:
        mutex_exit(&ds->ds_recvlock);
+       if (err == 0 && drc->drc_guid_to_ds_map != NULL)
+               (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds);
        dsl_dataset_disown(ds, dmu_recv_tag);
-       (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE);
+       myerr = dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE);
+       ASSERT3U(myerr, ==, 0);
        return (err);
 }
 
@@ -1555,6 +1664,8 @@ dmu_recv_new_end(dmu_recv_cookie_t *drc)
                /* clean up the fs we just recv'd into */
                (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE);
        } else {
+               if (drc->drc_guid_to_ds_map != NULL)
+                       (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds);
                /* release the hold from dmu_recv_begin */
                dsl_dataset_disown(ds, dmu_recv_tag);
        }