Update core ZFS code from build 121 to build 141.
author: Brian Behlendorf <behlendorf1@llnl.gov>
Fri, 28 May 2010 20:45:14 +0000 (13:45 -0700)
committer: Brian Behlendorf <behlendorf1@llnl.gov>
Fri, 28 May 2010 20:45:14 +0000 (13:45 -0700)
174 files changed:
ZFS.RELEASE
cmd/zdb/zdb.c
cmd/zdb/zdb_il.c
cmd/zfs/zfs_iter.c
cmd/zfs/zfs_iter.h
cmd/zfs/zfs_main.c
cmd/zfs/zfs_util.h
cmd/zinject/translate.c
cmd/zinject/zinject.c
cmd/zinject/zinject.h
cmd/zpool/zpool_main.c
cmd/zpool/zpool_util.c
cmd/zpool/zpool_util.h
cmd/zpool/zpool_vdev.c
cmd/ztest/ztest.c
lib/libnvpair/include/libnvpair.h
lib/libnvpair/libnvpair.c
lib/libzfs/include/libzfs.h
lib/libzfs/include/libzfs_impl.h
lib/libzfs/libzfs_changelist.c
lib/libzfs/libzfs_config.c
lib/libzfs/libzfs_dataset.c
lib/libzfs/libzfs_fru.c [new file with mode: 0644]
lib/libzfs/libzfs_import.c
lib/libzfs/libzfs_mount.c
lib/libzfs/libzfs_pool.c
lib/libzfs/libzfs_sendrecv.c
lib/libzfs/libzfs_status.c
lib/libzfs/libzfs_util.c
lib/libzpool/include/sys/zfs_context.h
lib/libzpool/kernel.c
lib/libzpool/taskq.c
lib/libzpool/util.c
module/avl/avl.c
module/avl/include/sys/avl.h
module/nvpair/include/sys/nvpair.h
module/nvpair/nvpair.c
module/zcommon/include/sys/fs/zfs.h
module/zcommon/include/zfs_comutil.h
module/zcommon/include/zfs_fletcher.h [new file with mode: 0644]
module/zcommon/include/zfs_prop.h
module/zcommon/zfs_comutil.c
module/zcommon/zfs_fletcher.c [moved from module/zfs/fletcher.c with 99% similarity]
module/zcommon/zfs_prop.c
module/zcommon/zpool_prop.c
module/zcommon/zprop_common.c
module/zfs/arc.c
module/zfs/bplist.c
module/zfs/bpobj.c [new file with mode: 0644]
module/zfs/dbuf.c
module/zfs/ddt.c [new file with mode: 0644]
module/zfs/ddt_zap.c [new file with mode: 0644]
module/zfs/dmu.c
module/zfs/dmu_object.c
module/zfs/dmu_objset.c
module/zfs/dmu_send.c
module/zfs/dmu_traverse.c
module/zfs/dmu_tx.c
module/zfs/dmu_zfetch.c
module/zfs/dnode.c
module/zfs/dnode_sync.c
module/zfs/dsl_dataset.c
module/zfs/dsl_deadlist.c [new file with mode: 0644]
module/zfs/dsl_deleg.c
module/zfs/dsl_dir.c
module/zfs/dsl_pool.c
module/zfs/dsl_prop.c
module/zfs/dsl_scan.c [new file with mode: 0644]
module/zfs/dsl_scrub.c [deleted file]
module/zfs/dsl_synctask.c
module/zfs/fm.c
module/zfs/include/sys/arc.h
module/zfs/include/sys/bplist.h
module/zfs/include/sys/bpobj.h [new file with mode: 0644]
module/zfs/include/sys/dbuf.h
module/zfs/include/sys/ddt.h [new file with mode: 0644]
module/zfs/include/sys/dmu.h
module/zfs/include/sys/dmu_impl.h
module/zfs/include/sys/dmu_objset.h
module/zfs/include/sys/dmu_traverse.h
module/zfs/include/sys/dmu_tx.h
module/zfs/include/sys/dmu_zfetch.h
module/zfs/include/sys/dnode.h
module/zfs/include/sys/dsl_dataset.h
module/zfs/include/sys/dsl_deadlist.h [new file with mode: 0644]
module/zfs/include/sys/dsl_dir.h
module/zfs/include/sys/dsl_pool.h
module/zfs/include/sys/dsl_prop.h
module/zfs/include/sys/dsl_scan.h [new file with mode: 0644]
module/zfs/include/sys/dsl_synctask.h
module/zfs/include/sys/fm/fs/zfs.h
module/zfs/include/sys/fm/protocol.h
module/zfs/include/sys/metaslab.h
module/zfs/include/sys/metaslab_impl.h
module/zfs/include/sys/refcount.h
module/zfs/include/sys/sa.h [new file with mode: 0644]
module/zfs/include/sys/sa_impl.h [new file with mode: 0644]
module/zfs/include/sys/spa.h
module/zfs/include/sys/spa_impl.h
module/zfs/include/sys/space_map.h
module/zfs/include/sys/txg.h
module/zfs/include/sys/txg_impl.h
module/zfs/include/sys/uberblock.h
module/zfs/include/sys/uberblock_impl.h
module/zfs/include/sys/vdev.h
module/zfs/include/sys/vdev_impl.h
module/zfs/include/sys/zap.h
module/zfs/include/sys/zap_impl.h
module/zfs/include/sys/zap_leaf.h
module/zfs/include/sys/zfs_acl.h
module/zfs/include/sys/zfs_context.h
module/zfs/include/sys/zfs_ctldir.h
module/zfs/include/sys/zfs_debug.h
module/zfs/include/sys/zfs_dir.h
module/zfs/include/sys/zfs_fuid.h
module/zfs/include/sys/zfs_ioctl.h
module/zfs/include/sys/zfs_sa.h [new file with mode: 0644]
module/zfs/include/sys/zfs_vfsops.h
module/zfs/include/sys/zfs_znode.h
module/zfs/include/sys/zil.h
module/zfs/include/sys/zil_impl.h
module/zfs/include/sys/zio.h
module/zfs/include/sys/zio_checksum.h
module/zfs/include/sys/zio_compress.h
module/zfs/include/sys/zio_impl.h
module/zfs/include/sys/zvol.h
module/zfs/lzjb.c
module/zfs/metaslab.c
module/zfs/refcount.c
module/zfs/sa.c [new file with mode: 0644]
module/zfs/sha256.c
module/zfs/spa.c
module/zfs/spa_boot.c
module/zfs/spa_config.c
module/zfs/spa_errlog.c
module/zfs/spa_history.c
module/zfs/spa_misc.c
module/zfs/space_map.c
module/zfs/txg.c
module/zfs/uberblock.c
module/zfs/vdev.c
module/zfs/vdev_cache.c
module/zfs/vdev_file.c
module/zfs/vdev_label.c
module/zfs/vdev_mirror.c
module/zfs/vdev_missing.c
module/zfs/vdev_queue.c
module/zfs/vdev_raidz.c
module/zfs/vdev_root.c
module/zfs/zap.c
module/zfs/zap_leaf.c
module/zfs/zap_micro.c
module/zfs/zfs_acl.c
module/zfs/zfs_byteswap.c
module/zfs/zfs_ctldir.c
module/zfs/zfs_debug.c [new file with mode: 0644]
module/zfs/zfs_dir.c
module/zfs/zfs_fm.c
module/zfs/zfs_fuid.c
module/zfs/zfs_ioctl.c
module/zfs/zfs_log.c
module/zfs/zfs_replay.c
module/zfs/zfs_rlock.c
module/zfs/zfs_sa.c [new file with mode: 0644]
module/zfs/zfs_vfsops.c
module/zfs/zfs_vnops.c
module/zfs/zfs_znode.c
module/zfs/zil.c
module/zfs/zio.c
module/zfs/zio_checksum.c
module/zfs/zio_compress.c
module/zfs/zio_inject.c
module/zfs/zle.c [new file with mode: 0644]
scripts/zfs-update.sh

index 0960bf4..dd19a8e 100644 (file)
@@ -1 +1 @@
-http://dlc.sun.com/osol/on/downloads/b121/on-src.tar.bz2
+ssh://anon@hg.opensolaris.org/hg/onnv/onnv-gate/onnv_141
index 292bb51..ff73072 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <stdio.h>
@@ -34,6 +33,9 @@
 #include <sys/zap.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
+#include <sys/zfs_sa.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab_impl.h>
 #include <sys/zio_compress.h>
 #include <sys/zfs_fuid.h>
 #include <sys/arc.h>
+#include <sys/ddt.h>
 #undef ZFS_MAXNAMELEN
 #undef verify
 #include <libzfs.h>
 
+#define        ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \
+    zio_compress_table[(idx)].ci_name : "UNKNOWN")
+#define        ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \
+    zio_checksum_table[(idx)].ci_name : "UNKNOWN")
+#define        ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ? \
+    dmu_ot[(idx)].ot_name : "UNKNOWN")
+#define        ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : DMU_OT_NUMTYPES)
+
+#ifndef lint
+extern int zfs_recover;
+#else
+int zfs_recover;
+#endif
+
 const char cmdname[] = "zdb";
 uint8_t dump_opt[256];
 
@@ -64,8 +81,6 @@ extern void dump_intent_log(zilog_t *);
 uint64_t *zopt_object = NULL;
 int zopt_objects = 0;
 libzfs_handle_t *g_zfs;
-boolean_t zdb_sig_user_data = B_TRUE;
-int zdb_sig_cksumalg = ZIO_CHECKSUM_SHA256;
 
 /*
  * These libumem hooks provide a reasonable set of defaults for the allocator's
@@ -87,39 +102,56 @@ static void
 usage(void)
 {
        (void) fprintf(stderr,
-           "Usage: %s [-udibcsvL] [-U cachefile_path] [-t txg]\n"
-           "\t   [-S user:cksumalg] "
-           "dataset [object...]\n"
-           "       %s -C [pool]\n"
-           "       %s -l dev\n"
-           "       %s -R pool:vdev:offset:size:flags\n"
-           "       %s [-p path_to_vdev_dir]\n"
-           "       %s -e pool | GUID | devid ...\n",
-           cmdname, cmdname, cmdname, cmdname, cmdname, cmdname);
-
-       (void) fprintf(stderr, "        -u uberblock\n");
-       (void) fprintf(stderr, "        -d datasets\n");
-       (void) fprintf(stderr, "        -C cached pool configuration\n");
-       (void) fprintf(stderr, "        -i intent logs\n");
-       (void) fprintf(stderr, "        -b block statistics\n");
-       (void) fprintf(stderr, "        -m metaslabs\n");
-       (void) fprintf(stderr, "        -c checksum all metadata (twice for "
+           "Usage: %s [-CumdibcsDvhL] poolname [object...]\n"
+           "       %s [-div] dataset [object...]\n"
+           "       %s -m [-L] poolname [vdev [metaslab...]]\n"
+           "       %s -R poolname vdev:offset:size[:flags]\n"
+           "       %s -S poolname\n"
+           "       %s -l [-u] device\n"
+           "       %s -C\n\n",
+           cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname);
+
+       (void) fprintf(stderr, "    Dataset name must include at least one "
+           "separator character '/' or '@'\n");
+       (void) fprintf(stderr, "    If dataset name is specified, only that "
+           "dataset is dumped\n");
+       (void) fprintf(stderr, "    If object numbers are specified, only "
+           "those objects are dumped\n\n");
+       (void) fprintf(stderr, "    Options to control amount of output:\n");
+       (void) fprintf(stderr, "        -u uberblock\n");
+       (void) fprintf(stderr, "        -d dataset(s)\n");
+       (void) fprintf(stderr, "        -i intent logs\n");
+       (void) fprintf(stderr, "        -C config (or cachefile if alone)\n");
+       (void) fprintf(stderr, "        -h pool history\n");
+       (void) fprintf(stderr, "        -b block statistics\n");
+       (void) fprintf(stderr, "        -m metaslabs\n");
+       (void) fprintf(stderr, "        -c checksum all metadata (twice for "
            "all data) blocks\n");
-       (void) fprintf(stderr, "        -s report stats on zdb's I/O\n");
-       (void) fprintf(stderr, "        -S <user|all>:<cksum_alg|all> -- "
-           "dump blkptr signatures\n");
-       (void) fprintf(stderr, "        -v verbose (applies to all others)\n");
+       (void) fprintf(stderr, "        -s report stats on zdb's I/O\n");
+       (void) fprintf(stderr, "        -D dedup statistics\n");
+       (void) fprintf(stderr, "        -S simulate dedup to measure effect\n");
+       (void) fprintf(stderr, "        -v verbose (applies to all others)\n");
        (void) fprintf(stderr, "        -l dump label contents\n");
        (void) fprintf(stderr, "        -L disable leak tracking (do not "
            "load spacemaps)\n");
-       (void) fprintf(stderr, "        -U cachefile_path -- use alternate "
-           "cachefile\n");
        (void) fprintf(stderr, "        -R read and display block from a "
-           "device\n");
-       (void) fprintf(stderr, "        -e Pool is exported/destroyed/"
-           "has altroot\n");
-       (void) fprintf(stderr, "        -p <Path to vdev dir> (use with -e)\n");
-       (void) fprintf(stderr, "        -t <txg> highest txg to use when "
+           "device\n\n");
+       (void) fprintf(stderr, "    Below options are intended for use "
+           "with other options (except -l):\n");
+       (void) fprintf(stderr, "        -A ignore assertions (-A), enable "
+           "panic recovery (-AA) or both (-AAA)\n");
+       (void) fprintf(stderr, "        -F attempt automatic rewind within "
+           "safe range of transaction groups\n");
+       (void) fprintf(stderr, "        -U <cachefile_path> -- use alternate "
+           "cachefile\n");
+       (void) fprintf(stderr, "        -X attempt extreme rewind (does not "
+           "work with dataset)\n");
+       (void) fprintf(stderr, "        -e pool is exported/destroyed/"
+           "has altroot/not in a cachefile\n");
+       (void) fprintf(stderr, "        -p <path> -- use one or more with "
+           "-e to specify path to vdev dir\n");
+       (void) fprintf(stderr, "        -P print numbers parsable\n");
+       (void) fprintf(stderr, "        -t <txg> -- highest txg to use when "
            "searching for uberblocks\n");
        (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
            "to make only that option verbose\n");
@@ -146,68 +178,6 @@ fatal(const char *fmt, ...)
        exit(1);
 }
 
-static void
-dump_nvlist(nvlist_t *list, int indent)
-{
-       nvpair_t *elem = NULL;
-
-       while ((elem = nvlist_next_nvpair(list, elem)) != NULL) {
-               switch (nvpair_type(elem)) {
-               case DATA_TYPE_STRING:
-                       {
-                               char *value;
-
-                               VERIFY(nvpair_value_string(elem, &value) == 0);
-                               (void) printf("%*s%s='%s'\n", indent, "",
-                                   nvpair_name(elem), value);
-                       }
-                       break;
-
-               case DATA_TYPE_UINT64:
-                       {
-                               uint64_t value;
-
-                               VERIFY(nvpair_value_uint64(elem, &value) == 0);
-                               (void) printf("%*s%s=%llu\n", indent, "",
-                                   nvpair_name(elem), (u_longlong_t)value);
-                       }
-                       break;
-
-               case DATA_TYPE_NVLIST:
-                       {
-                               nvlist_t *value;
-
-                               VERIFY(nvpair_value_nvlist(elem, &value) == 0);
-                               (void) printf("%*s%s\n", indent, "",
-                                   nvpair_name(elem));
-                               dump_nvlist(value, indent + 4);
-                       }
-                       break;
-
-               case DATA_TYPE_NVLIST_ARRAY:
-                       {
-                               nvlist_t **value;
-                               uint_t c, count;
-
-                               VERIFY(nvpair_value_nvlist_array(elem, &value,
-                                   &count) == 0);
-
-                               for (c = 0; c < count; c++) {
-                                       (void) printf("%*s%s[%u]\n", indent, "",
-                                           nvpair_name(elem), c);
-                                       dump_nvlist(value[c], indent + 8);
-                               }
-                       }
-                       break;
-
-               default:
-
-                       (void) printf("bad config type %d for %s\n",
-                           nvpair_type(elem), nvpair_name(elem));
-               }
-       }
-}
-
 /* ARGSUSED */
 static void
 dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
@@ -227,6 +197,15 @@ dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
        nvlist_free(nv);
 }
 
+static void
+zdb_nicenum(uint64_t num, char *buf)
+{
+       if (dump_opt['P'])
+               (void) sprintf(buf, "%llu", (longlong_t)num);
+       else
+               nicenum(num, buf);
+}
+
 const char dump_zap_stars[] = "****************************************";
 const int dump_zap_width = sizeof (dump_zap_stars) - 1;
 
@@ -325,6 +304,13 @@ dump_none(objset_t *os, uint64_t object, void *data, size_t size)
 }
 
 /*ARGSUSED*/
+static void
+dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
+{
+       (void) printf("\tUNKNOWN OBJECT TYPE\n");
+}
+
+/*ARGSUSED*/
 void
 dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
 {
@@ -388,6 +374,79 @@ dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
 
 /*ARGSUSED*/
 static void
+dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
+{
+       dump_zap_stats(os, object);
+       /* contents are printed elsewhere, properly decoded */
+}
+
+/*ARGSUSED*/
+static void
+dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
+{
+       zap_cursor_t zc;
+       zap_attribute_t attr;
+
+       dump_zap_stats(os, object);
+       (void) printf("\n");
+
+       for (zap_cursor_init(&zc, os, object);
+           zap_cursor_retrieve(&zc, &attr) == 0;
+           zap_cursor_advance(&zc)) {
+               (void) printf("\t\t%s = ", attr.za_name);
+               if (attr.za_num_integers == 0) {
+                       (void) printf("\n");
+                       continue;
+               }
+               (void) printf(" %llx : [%d:%d:%d]\n",
+                   (u_longlong_t)attr.za_first_integer,
+                   (int)ATTR_LENGTH(attr.za_first_integer),
+                   (int)ATTR_BSWAP(attr.za_first_integer),
+                   (int)ATTR_NUM(attr.za_first_integer));
+       }
+       zap_cursor_fini(&zc);
+}
+
+/*ARGSUSED*/
+static void
+dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
+{
+       zap_cursor_t zc;
+       zap_attribute_t attr;
+       uint16_t *layout_attrs;
+       int i;
+
+       dump_zap_stats(os, object);
+       (void) printf("\n");
+
+       for (zap_cursor_init(&zc, os, object);
+           zap_cursor_retrieve(&zc, &attr) == 0;
+           zap_cursor_advance(&zc)) {
+               (void) printf("\t\t%s = [", attr.za_name);
+               if (attr.za_num_integers == 0) {
+                       (void) printf("\n");
+                       continue;
+               }
+
+               VERIFY(attr.za_integer_length == 2);
+               layout_attrs = umem_zalloc(attr.za_num_integers *
+                   attr.za_integer_length, UMEM_NOFAIL);
+
+               VERIFY(zap_lookup(os, object, attr.za_name,
+                   attr.za_integer_length,
+                   attr.za_num_integers, layout_attrs) == 0);
+
+               for (i = 0; i != attr.za_num_integers; i++)
+                       (void) printf(" %d ", (int)layout_attrs[i]);
+               (void) printf("]\n");
+               umem_free(layout_attrs,
+                   attr.za_num_integers * attr.za_integer_length);
+       }
+       zap_cursor_fini(&zc);
+}
+
+/*ARGSUSED*/
+static void
 dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
 {
        zap_cursor_t zc;
@@ -441,17 +500,17 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
         */
        alloc = 0;
        for (offset = 0; offset < smo->smo_objsize; offset += sizeof (entry)) {
-               VERIFY(0 == dmu_read(os, smo->smo_object, offset,
+               VERIFY3U(0, ==, dmu_read(os, smo->smo_object, offset,
                    sizeof (entry), &entry, DMU_READ_PREFETCH));
                if (SM_DEBUG_DECODE(entry)) {
-                       (void) printf("\t\t[%4llu] %s: txg %llu, pass %llu\n",
+                       (void) printf("\t    [%6llu] %s: txg %llu, pass %llu\n",
                            (u_longlong_t)(offset / sizeof (entry)),
                            ddata[SM_DEBUG_ACTION_DECODE(entry)],
                            (u_longlong_t)SM_DEBUG_TXG_DECODE(entry),
                            (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(entry));
                } else {
-                       (void) printf("\t\t[%4llu]    %c  range:"
-                           " %08llx-%08llx  size: %06llx\n",
+                       (void) printf("\t    [%6llu]    %c  range:"
+                           " %010llx-%010llx  size: %06llx\n",
                            (u_longlong_t)(offset / sizeof (entry)),
                            SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
                            (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
@@ -476,14 +535,14 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
 static void
 dump_metaslab_stats(metaslab_t *msp)
 {
-       char maxbuf[5];
+       char maxbuf[32];
        space_map_t *sm = &msp->ms_map;
        avl_tree_t *t = sm->sm_pp_root;
        int free_pct = sm->sm_space * 100 / sm->sm_size;
 
-       nicenum(space_map_maxsize(sm), maxbuf);
+       zdb_nicenum(space_map_maxsize(sm), maxbuf);
 
-       (void) printf("\t %20s %10lu   %7s  %6s   %4s %4d%%\n",
+       (void) printf("\t %25s %10lu   %7s  %6s   %4s %4d%%\n",
            "segments", avl_numnodes(t), "maxsize", maxbuf,
            "freepct", free_pct);
 }
@@ -491,54 +550,86 @@ dump_metaslab_stats(metaslab_t *msp)
 static void
 dump_metaslab(metaslab_t *msp)
 {
-       char freebuf[5];
-       space_map_obj_t *smo = &msp->ms_smo;
        vdev_t *vd = msp->ms_group->mg_vd;
        spa_t *spa = vd->vdev_spa;
+       space_map_t *sm = &msp->ms_map;
+       space_map_obj_t *smo = &msp->ms_smo;
+       char freebuf[32];
 
-       nicenum(msp->ms_map.sm_size - smo->smo_alloc, freebuf);
+       zdb_nicenum(sm->sm_size - smo->smo_alloc, freebuf);
 
        (void) printf(
-           "\tvdev %5llu   offset %12llx   spacemap %6llu   free    %5s\n",
-           (u_longlong_t)vd->vdev_id, (u_longlong_t)msp->ms_map.sm_start,
-           (u_longlong_t)smo->smo_object, freebuf);
+           "\tmetaslab %6llu   offset %12llx   spacemap %6llu   free    %5s\n",
+           (u_longlong_t)(sm->sm_start / sm->sm_size),
+           (u_longlong_t)sm->sm_start, (u_longlong_t)smo->smo_object, freebuf);
 
-       if (dump_opt['m'] > 1) {
+       if (dump_opt['m'] > 1 && !dump_opt['L']) {
                mutex_enter(&msp->ms_lock);
-               VERIFY(space_map_load(&msp->ms_map, zfs_metaslab_ops,
-                   SM_FREE, &msp->ms_smo, spa->spa_meta_objset) == 0);
+               space_map_load_wait(sm);
+               if (!sm->sm_loaded)
+                       VERIFY(space_map_load(sm, zfs_metaslab_ops,
+                           SM_FREE, smo, spa->spa_meta_objset) == 0);
                dump_metaslab_stats(msp);
-               space_map_unload(&msp->ms_map);
+               space_map_unload(sm);
                mutex_exit(&msp->ms_lock);
        }
 
        if (dump_opt['d'] > 5 || dump_opt['m'] > 2) {
-               ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift));
+               ASSERT(sm->sm_size == (1ULL << vd->vdev_ms_shift));
 
                mutex_enter(&msp->ms_lock);
-               dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map);
+               dump_spacemap(spa->spa_meta_objset, smo, sm);
                mutex_exit(&msp->ms_lock);
        }
+}
 
+static void
+print_vdev_metaslab_header(vdev_t *vd)
+{
+       (void) printf("\tvdev %10llu\n\t%-10s%5llu   %-19s   %-15s   %-10s\n",
+           (u_longlong_t)vd->vdev_id,
+           "metaslabs", (u_longlong_t)vd->vdev_ms_count,
+           "offset", "spacemap", "free");
+       (void) printf("\t%15s   %19s   %15s   %10s\n",
+           "---------------", "-------------------",
+           "---------------", "-------------");
 }
 
 static void
 dump_metaslabs(spa_t *spa)
 {
-       vdev_t *rvd = spa->spa_root_vdev;
-       vdev_t *vd;
-       int c, m;
+       vdev_t *vd, *rvd = spa->spa_root_vdev;
+       uint64_t m, c = 0, children = rvd->vdev_children;
 
        (void) printf("\nMetaslabs:\n");
 
-       for (c = 0; c < rvd->vdev_children; c++) {
-               vd = rvd->vdev_child[c];
+       if (!dump_opt['d'] && zopt_objects > 0) {
+               c = zopt_object[0];
+
+               if (c >= children)
+                       (void) fatal("bad vdev id: %llu", (u_longlong_t)c);
 
-               (void) printf("\t%-10s   %-19s   %-15s   %-10s\n",
-                   "vdev", "offset", "spacemap", "free");
-               (void) printf("\t%10s   %19s   %15s   %10s\n",
-                   "----------", "-------------------",
-                   "---------------", "-------------");
+               if (zopt_objects > 1) {
+                       vd = rvd->vdev_child[c];
+                       print_vdev_metaslab_header(vd);
+
+                       for (m = 1; m < zopt_objects; m++) {
+                               if (zopt_object[m] < vd->vdev_ms_count)
+                                       dump_metaslab(
+                                           vd->vdev_ms[zopt_object[m]]);
+                               else
+                                       (void) fprintf(stderr, "bad metaslab "
+                                           "number %llu\n",
+                                           (u_longlong_t)zopt_object[m]);
+                       }
+                       (void) printf("\n");
+                       return;
+               }
+               children = c + 1;
+       }
+       for (; c < children; c++) {
+               vd = rvd->vdev_child[c];
+               print_vdev_metaslab_header(vd);
 
                for (m = 0; m < vd->vdev_ms_count; m++)
                        dump_metaslab(vd->vdev_ms[m]);
@@ -547,6 +638,133 @@ dump_metaslabs(spa_t *spa)
 }
 
 static void
+dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
+{
+       const ddt_phys_t *ddp = dde->dde_phys;
+       const ddt_key_t *ddk = &dde->dde_key;
+       char *types[4] = { "ditto", "single", "double", "triple" };
+       char blkbuf[BP_SPRINTF_LEN];
+       blkptr_t blk;
+
+       for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+               if (ddp->ddp_phys_birth == 0)
+                       continue;
+               ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+               sprintf_blkptr(blkbuf, &blk);
+               (void) printf("index %llx refcnt %llu %s %s\n",
+                   (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
+                   types[p], blkbuf);
+       }
+}
+
+static void
+dump_dedup_ratio(const ddt_stat_t *dds)
+{
+       double rL, rP, rD, D, dedup, compress, copies;
+
+       if (dds->dds_blocks == 0)
+               return;
+
+       rL = (double)dds->dds_ref_lsize;
+       rP = (double)dds->dds_ref_psize;
+       rD = (double)dds->dds_ref_dsize;
+       D = (double)dds->dds_dsize;
+
+       dedup = rD / D;
+       compress = rL / rP;
+       copies = rD / rP;
+
+       (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
+           "dedup * compress / copies = %.2f\n\n",
+           dedup, compress, copies, dedup * compress / copies);
+}
+
+static void
+dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+       char name[DDT_NAMELEN];
+       ddt_entry_t dde;
+       uint64_t walk = 0;
+       dmu_object_info_t doi;
+       uint64_t count, dspace, mspace;
+       int error;
+
+       error = ddt_object_info(ddt, type, class, &doi);
+
+       if (error == ENOENT)
+               return;
+       ASSERT(error == 0);
+
+       count = ddt_object_count(ddt, type, class);
+       dspace = doi.doi_physical_blocks_512 << 9;
+       mspace = doi.doi_fill_count * doi.doi_data_block_size;
+
+       ASSERT(count != 0);     /* we should have destroyed it */
+
+       ddt_object_name(ddt, type, class, name);
+
+       (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
+           name,
+           (u_longlong_t)count,
+           (u_longlong_t)(dspace / count),
+           (u_longlong_t)(mspace / count));
+
+       if (dump_opt['D'] < 3)
+               return;
+
+       zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
+
+       if (dump_opt['D'] < 4)
+               return;
+
+       if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
+               return;
+
+       (void) printf("%s contents:\n\n", name);
+
+       while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
+               dump_dde(ddt, &dde, walk);
+
+       ASSERT(error == ENOENT);
+
+       (void) printf("\n");
+}
+
+static void
+dump_all_ddts(spa_t *spa)
+{
+       ddt_histogram_t ddh_total = { 0 };
+       ddt_stat_t dds_total = { 0 };
+
+       for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+               ddt_t *ddt = spa->spa_ddt[c];
+               for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+                       for (enum ddt_class class = 0; class < DDT_CLASSES;
+                           class++) {
+                               dump_ddt(ddt, type, class);
+                       }
+               }
+       }
+
+       ddt_get_dedup_stats(spa, &dds_total);
+
+       if (dds_total.dds_blocks == 0) {
+               (void) printf("All DDTs are empty\n");
+               return;
+       }
+
+       (void) printf("\n");
+
+       if (dump_opt['D'] > 1) {
+               (void) printf("DDT histogram (aggregated over all DDTs):\n");
+               ddt_get_dedup_histogram(spa, &ddh_total);
+               zpool_dump_ddt(&dds_total, &ddh_total);
+       }
+
+       dump_dedup_ratio(&dds_total);
+}
+
+static void
 dump_dtl_seg(space_map_t *sm, uint64_t start, uint64_t size)
 {
        char *prefix = (void *)sm;
@@ -566,7 +784,7 @@ dump_dtl(vdev_t *vd, int indent)
        char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" };
        char prefix[256];
 
-       spa_vdev_state_enter(spa);
+       spa_vdev_state_enter(spa, SCL_NONE);
        required = vdev_dtl_required(vd);
        (void) spa_vdev_state_exit(spa, NULL, 0);
 
@@ -596,6 +814,68 @@ dump_dtl(vdev_t *vd, int indent)
                dump_dtl(vd->vdev_child[c], indent + 4);
 }
 
+static void
+dump_history(spa_t *spa)
+{
+       nvlist_t **events = NULL;
+       char buf[SPA_MAXBLOCKSIZE];
+       uint64_t resid, len, off = 0;
+       uint_t num = 0;
+       int error;
+       time_t tsec;
+       struct tm t;
+       char tbuf[30];
+       char internalstr[MAXPATHLEN];
+
+       do {
+               len = sizeof (buf);
+
+               if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
+                       (void) fprintf(stderr, "Unable to read history: "
+                           "error %d\n", error);
+                       return;
+               }
+
+               if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
+                       break;
+
+               off -= resid;
+       } while (len != 0);
+
+       (void) printf("\nHistory:\n");
+       for (int i = 0; i < num; i++) {
+               uint64_t time, txg, ievent;
+               char *cmd, *intstr;
+
+               if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
+                   &time) != 0)
+                       continue;
+               if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
+                   &cmd) != 0) {
+                       if (nvlist_lookup_uint64(events[i],
+                           ZPOOL_HIST_INT_EVENT, &ievent) != 0)
+                               continue;
+                       verify(nvlist_lookup_uint64(events[i],
+                           ZPOOL_HIST_TXG, &txg) == 0);
+                       verify(nvlist_lookup_string(events[i],
+                           ZPOOL_HIST_INT_STR, &intstr) == 0);
+                       if (ievent >= LOG_END)
+                               continue;
+
+                       (void) snprintf(internalstr,
+                           sizeof (internalstr),
+                           "[internal %s txg:%lld] %s",
+                           zfs_history_event_names[ievent], txg,
+                           intstr);
+                       cmd = internalstr;
+               }
+               tsec = time;
+               (void) localtime_r(&tsec, &t);
+               (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
+               (void) printf("%s %s\n", tbuf, cmd);
+       }
+}
+
 /*ARGSUSED*/
 static void
 dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
@@ -603,35 +883,48 @@ dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
 }
 
 static uint64_t
-blkid2offset(const dnode_phys_t *dnp, int level, uint64_t blkid)
+blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_t *zb)
 {
-       if (level < 0)
-               return (blkid);
+       if (dnp == NULL) {
+               ASSERT(zb->zb_level < 0);
+               if (zb->zb_object == 0)
+                       return (zb->zb_blkid);
+               return (zb->zb_blkid * BP_GET_LSIZE(bp));
+       }
+
+       ASSERT(zb->zb_level >= 0);
 
-       return ((blkid << (level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
+       return ((zb->zb_blkid <<
+           (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
            dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 }
 
 static void
-sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp, int alldvas)
+sprintf_blkptr_compact(char *blkbuf, const blkptr_t *bp)
 {
-       dva_t *dva = bp->blk_dva;
-       int ndvas = alldvas ? BP_GET_NDVAS(bp) : 1;
-       int i;
+       const dva_t *dva = bp->blk_dva;
+       int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
+
+       if (dump_opt['b'] >= 5) {
+               sprintf_blkptr(blkbuf, bp);
+               return;
+       }
 
        blkbuf[0] = '\0';
 
-       for (i = 0; i < ndvas; i++)
+       for (int i = 0; i < ndvas; i++)
                (void) sprintf(blkbuf + strlen(blkbuf), "%llu:%llx:%llx ",
                    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
                    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
                    (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
 
-       (void) sprintf(blkbuf + strlen(blkbuf), "%llxL/%llxP F=%llu B=%llu",
+       (void) sprintf(blkbuf + strlen(blkbuf),
+           "%llxL/%llxP F=%llu B=%llu/%llu",
            (u_longlong_t)BP_GET_LSIZE(bp),
            (u_longlong_t)BP_GET_PSIZE(bp),
            (u_longlong_t)bp->blk_fill,
-           (u_longlong_t)bp->blk_birth);
+           (u_longlong_t)bp->blk_birth,
+           (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
 }
 
 static void
@@ -644,8 +937,7 @@ print_indirect(blkptr_t *bp, const zbookmark_t *zb,
        ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
        ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
 
-       (void) printf("%16llx ",
-           (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid));
+       (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
 
        ASSERT(zb->zb_level >= 0);
 
@@ -657,23 +949,15 @@ print_indirect(blkptr_t *bp, const zbookmark_t *zb,
                }
        }
 
-       sprintf_blkptr_compact(blkbuf, bp, dump_opt['d'] > 5 ? 1 : 0);
+       sprintf_blkptr_compact(blkbuf, bp);
        (void) printf("%s\n", blkbuf);
 }
 
-#define        SET_BOOKMARK(zb, objset, object, level, blkid)  \
-{                                                       \
-       (zb)->zb_objset = objset;                       \
-       (zb)->zb_object = object;                       \
-       (zb)->zb_level = level;                         \
-       (zb)->zb_blkid = blkid;                         \
-}
-
 static int
 visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
     blkptr_t *bp, const zbookmark_t *zb)
 {
-       int err;
+       int err = 0;
 
        if (bp->blk_birth == 0)
                return (0);
@@ -692,6 +976,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
                    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
                if (err)
                        return (err);
+               ASSERT(buf->b_data);
 
                /* recursively visit blocks below this */
                cbp = buf->b_data;
@@ -724,11 +1009,11 @@ dump_indirect(dnode_t *dn)
 
        (void) printf("Indirect blocks:\n");
 
-       SET_BOOKMARK(&czb, dmu_objset_id(&dn->dn_objset->os),
+       SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset),
            dn->dn_object, dnp->dn_nlevels - 1, 0);
        for (j = 0; j < dnp->dn_nblkptr; j++) {
                czb.zb_blkid = j;
-               (void) visit_indirect(dmu_objset_spa(&dn->dn_objset->os), dnp,
+               (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,
                    &dnp->dn_blkptr[j], &czb);
        }
 
@@ -741,7 +1026,7 @@ dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
 {
        dsl_dir_phys_t *dd = data;
        time_t crtime;
-       char nice[6];
+       char nice[32];
 
        if (dd == NULL)
                return;
@@ -758,15 +1043,15 @@ dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
            (u_longlong_t)dd->dd_origin_obj);
        (void) printf("\t\tchild_dir_zapobj = %llu\n",
            (u_longlong_t)dd->dd_child_dir_zapobj);
-       nicenum(dd->dd_used_bytes, nice);
+       zdb_nicenum(dd->dd_used_bytes, nice);
        (void) printf("\t\tused_bytes = %s\n", nice);
-       nicenum(dd->dd_compressed_bytes, nice);
+       zdb_nicenum(dd->dd_compressed_bytes, nice);
        (void) printf("\t\tcompressed_bytes = %s\n", nice);
-       nicenum(dd->dd_uncompressed_bytes, nice);
+       zdb_nicenum(dd->dd_uncompressed_bytes, nice);
        (void) printf("\t\tuncompressed_bytes = %s\n", nice);
-       nicenum(dd->dd_quota, nice);
+       zdb_nicenum(dd->dd_quota, nice);
        (void) printf("\t\tquota = %s\n", nice);
-       nicenum(dd->dd_reserved, nice);
+       zdb_nicenum(dd->dd_reserved, nice);
        (void) printf("\t\treserved = %s\n", nice);
        (void) printf("\t\tprops_zapobj = %llu\n",
            (u_longlong_t)dd->dd_props_zapobj);
@@ -776,7 +1061,7 @@ dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
            (u_longlong_t)dd->dd_flags);
 
 #define        DO(which) \
-       nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice); \
+       zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice); \
        (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
        DO(HEAD);
        DO(SNAP);
@@ -792,7 +1077,7 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
 {
        dsl_dataset_phys_t *ds = data;
        time_t crtime;
-       char used[6], compressed[6], uncompressed[6], unique[6];
+       char used[32], compressed[32], uncompressed[32], unique[32];
        char blkbuf[BP_SPRINTF_LEN];
 
        if (ds == NULL)
@@ -800,11 +1085,11 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
 
        ASSERT(size == sizeof (*ds));
        crtime = ds->ds_creation_time;
-       nicenum(ds->ds_used_bytes, used);
-       nicenum(ds->ds_compressed_bytes, compressed);
-       nicenum(ds->ds_uncompressed_bytes, uncompressed);
-       nicenum(ds->ds_unique_bytes, unique);
-       sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &ds->ds_bp);
+       zdb_nicenum(ds->ds_used_bytes, used);
+       zdb_nicenum(ds->ds_compressed_bytes, compressed);
+       zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed);
+       zdb_nicenum(ds->ds_unique_bytes, unique);
+       sprintf_blkptr(blkbuf, &ds->ds_bp);
 
        (void) printf("\t\tdir_obj = %llu\n",
            (u_longlong_t)ds->ds_dir_obj);
@@ -842,63 +1127,88 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
        (void) printf("\t\tbp = %s\n", blkbuf);
 }
 
+/* ARGSUSED */
+static int
+dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+       char blkbuf[BP_SPRINTF_LEN];
+
+       ASSERT(bp->blk_birth != 0);
+       sprintf_blkptr_compact(blkbuf, bp);
+       (void) printf("\t%s\n", blkbuf);
+       return (0);
+}
+
 static void
-dump_bplist(objset_t *mos, uint64_t object, char *name)
+dump_bpobj(bpobj_t *bpo, char *name)
 {
-       bplist_t bpl = { 0 };
-       blkptr_t blk, *bp = &blk;
-       uint64_t itor = 0;
-       char bytes[6];
-       char comp[6];
-       char uncomp[6];
+       char bytes[32];
+       char comp[32];
+       char uncomp[32];
 
        if (dump_opt['d'] < 3)
                return;
 
-       mutex_init(&bpl.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
-       VERIFY(0 == bplist_open(&bpl, mos, object));
-       if (bplist_empty(&bpl)) {
-               bplist_close(&bpl);
-               mutex_destroy(&bpl.bpl_lock);
-               return;
-       }
-
-       nicenum(bpl.bpl_phys->bpl_bytes, bytes);
-       if (bpl.bpl_dbuf->db_size == sizeof (bplist_phys_t)) {
-               nicenum(bpl.bpl_phys->bpl_comp, comp);
-               nicenum(bpl.bpl_phys->bpl_uncomp, uncomp);
-               (void) printf("\n    %s: %llu entries, %s (%s/%s comp)\n",
-                   name, (u_longlong_t)bpl.bpl_phys->bpl_entries,
+       zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes);
+       if (bpo->bpo_havesubobj) {
+               zdb_nicenum(bpo->bpo_phys->bpo_comp, comp);
+               zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp);
+               (void) printf("\n    %s: %llu local blkptrs, %llu subobjs, "
+                   "%s (%s/%s comp)\n",
+                   name, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
+                   (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
                    bytes, comp, uncomp);
        } else {
-               (void) printf("\n    %s: %llu entries, %s\n",
-                   name, (u_longlong_t)bpl.bpl_phys->bpl_entries, bytes);
+               (void) printf("\n    %s: %llu blkptrs, %s\n",
+                   name, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, bytes);
        }
 
-       if (dump_opt['d'] < 5) {
-               bplist_close(&bpl);
-               mutex_destroy(&bpl.bpl_lock);
+       if (dump_opt['d'] < 5)
                return;
-       }
 
        (void) printf("\n");
 
-       while (bplist_iterate(&bpl, &itor, bp) == 0) {
-               char blkbuf[BP_SPRINTF_LEN];
+       (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
+}
 
-               ASSERT(bp->blk_birth != 0);
-               sprintf_blkptr_compact(blkbuf, bp, dump_opt['d'] > 5 ? 1 : 0);
-               (void) printf("\tItem %3llu: %s\n",
-                   (u_longlong_t)itor - 1, blkbuf);
-       }
+static void
+dump_deadlist(dsl_deadlist_t *dl)
+{
+       dsl_deadlist_entry_t *dle;
+       char bytes[32];
+       char comp[32];
+       char uncomp[32];
+
+       if (dump_opt['d'] < 3)
+               return;
 
-       bplist_close(&bpl);
-       mutex_destroy(&bpl.bpl_lock);
+       zdb_nicenum(dl->dl_phys->dl_used, bytes);
+       zdb_nicenum(dl->dl_phys->dl_comp, comp);
+       zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp);
+       (void) printf("\n    Deadlist: %s (%s/%s comp)\n",
+           bytes, comp, uncomp);
+
+       if (dump_opt['d'] < 4)
+               return;
+
+       (void) printf("\n");
+
+       for (dle = avl_first(&dl->dl_tree); dle;
+           dle = AVL_NEXT(&dl->dl_tree, dle)) {
+               (void) printf("      mintxg %llu -> obj %llu\n",
+                   (longlong_t)dle->dle_mintxg,
+                   (longlong_t)dle->dle_bpobj.bpo_object);
+
+               if (dump_opt['d'] >= 5)
+                       dump_bpobj(&dle->dle_bpobj, "");
+       }
 }
 
 static avl_tree_t idx_tree;
 static avl_tree_t domain_tree;
 static boolean_t fuid_table_loaded;
+static boolean_t sa_loaded;
+sa_attr_type_t *sa_attr_table;
 
 static void
 fuid_table_destroy()
@@ -931,12 +1241,12 @@ print_idstr(uint64_t id, const char *id_type)
 }
 
 static void
-dump_uidgid(objset_t *os, znode_phys_t *zp)
+dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
 {
        uint32_t uid_idx, gid_idx;
 
-       uid_idx = FUID_INDEX(zp->zp_uid);
-       gid_idx = FUID_INDEX(zp->zp_gid);
+       uid_idx = FUID_INDEX(uid);
+       gid_idx = FUID_INDEX(gid);
 
        /* Load domain table, if not already loaded */
        if (!fuid_table_loaded && (uid_idx || gid_idx)) {
@@ -951,50 +1261,107 @@ dump_uidgid(objset_t *os, znode_phys_t *zp)
                fuid_table_loaded = B_TRUE;
        }
 
-       print_idstr(zp->zp_uid, "uid");
-       print_idstr(zp->zp_gid, "gid");
+       print_idstr(uid, "uid");
+       print_idstr(gid, "gid");
 }
 
 /*ARGSUSED*/
 static void
 dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
 {
-       znode_phys_t *zp = data;
-       time_t z_crtime, z_atime, z_mtime, z_ctime;
        char path[MAXPATHLEN * 2];      /* allow for xattr and failure prefix */
+       sa_handle_t *hdl;
+       uint64_t xattr, rdev, gen;
+       uint64_t uid, gid, mode, fsize, parent, links;
+       uint64_t pflags;
+       uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
+       time_t z_crtime, z_atime, z_mtime, z_ctime;
+       sa_bulk_attr_t bulk[12];
+       int idx = 0;
        int error;
 
-       ASSERT(size >= sizeof (znode_phys_t));
+       if (!sa_loaded) {
+               uint64_t sa_attrs = 0;
+               uint64_t version;
+
+               VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+                   8, 1, &version) == 0);
+               if (version >= ZPL_VERSION_SA) {
+                       VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
+                           8, 1, &sa_attrs) == 0);
+               }
+               sa_attr_table = sa_setup(os, sa_attrs,
+                   zfs_attr_table, ZPL_END);
+               sa_loaded = B_TRUE;
+       }
+
+       if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
+               (void) printf("Failed to get handle for SA znode\n");
+               return;
+       }
+
+       SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
+       SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
+       SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
+           &links, 8);
+       SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
+       SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
+           &mode, 8);
+       SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
+           NULL, &parent, 8);
+       SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
+           &fsize, 8);
+       SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
+           acctm, 16);
+       SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
+           modtm, 16);
+       SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
+           crtm, 16);
+       SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
+           chgtm, 16);
+       SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
+           &pflags, 8);
+
+       if (sa_bulk_lookup(hdl, bulk, idx)) {
+               (void) sa_handle_destroy(hdl);
+               return;
+       }
 
        error = zfs_obj_to_path(os, object, path, sizeof (path));
        if (error != 0) {
                (void) snprintf(path, sizeof (path), "\?\?\?<object#%llu>",
                    (u_longlong_t)object);
        }
-
        if (dump_opt['d'] < 3) {
                (void) printf("\t%s\n", path);
+               (void) sa_handle_destroy(hdl);
                return;
        }
 
-       z_crtime = (time_t)zp->zp_crtime[0];
-       z_atime = (time_t)zp->zp_atime[0];
-       z_mtime = (time_t)zp->zp_mtime[0];
-       z_ctime = (time_t)zp->zp_ctime[0];
+       z_crtime = (time_t)crtm[0];
+       z_atime = (time_t)acctm[0];
+       z_mtime = (time_t)modtm[0];
+       z_ctime = (time_t)chgtm[0];
 
        (void) printf("\tpath   %s\n", path);
-       dump_uidgid(os, zp);
+       dump_uidgid(os, uid, gid);
        (void) printf("\tatime  %s", ctime(&z_atime));
        (void) printf("\tmtime  %s", ctime(&z_mtime));
        (void) printf("\tctime  %s", ctime(&z_ctime));
        (void) printf("\tcrtime %s", ctime(&z_crtime));
-       (void) printf("\tgen    %llu\n", (u_longlong_t)zp->zp_gen);
-       (void) printf("\tmode   %llo\n", (u_longlong_t)zp->zp_mode);
-       (void) printf("\tsize   %llu\n", (u_longlong_t)zp->zp_size);
-       (void) printf("\tparent %llu\n", (u_longlong_t)zp->zp_parent);
-       (void) printf("\tlinks  %llu\n", (u_longlong_t)zp->zp_links);
-       (void) printf("\txattr  %llu\n", (u_longlong_t)zp->zp_xattr);
-       (void) printf("\trdev   0x%016llx\n", (u_longlong_t)zp->zp_rdev);
+       (void) printf("\tgen    %llu\n", (u_longlong_t)gen);
+       (void) printf("\tmode   %llo\n", (u_longlong_t)mode);
+       (void) printf("\tsize   %llu\n", (u_longlong_t)fsize);
+       (void) printf("\tparent %llu\n", (u_longlong_t)parent);
+       (void) printf("\tlinks  %llu\n", (u_longlong_t)links);
+       (void) printf("\tpflags %llx\n", (u_longlong_t)pflags);
+       if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
+           sizeof (uint64_t)) == 0)
+               (void) printf("\txattr  %llu\n", (u_longlong_t)xattr);
+       if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
+           sizeof (uint64_t)) == 0)
+               (void) printf("\trdev   0x%016llx\n", (u_longlong_t)rdev);
+       sa_handle_destroy(hdl);
 }
 
 /*ARGSUSED*/
@@ -1009,7 +1376,7 @@ dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
 {
 }
 
-static object_viewer_t *object_viewer[DMU_OT_NUMTYPES] = {
+static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
        dump_none,              /* unallocated                  */
        dump_zap,               /* object directory             */
        dump_uint64,            /* object array                 */
@@ -1052,6 +1419,19 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES] = {
        dump_zap,               /* ZFS user/group used          */
        dump_zap,               /* ZFS user/group quota         */
        dump_zap,               /* snapshot refcount tags       */
+       dump_ddt_zap,           /* DDT ZAP object               */
+       dump_zap,               /* DDT statistics               */
+       dump_znode,             /* SA object                    */
+       dump_zap,               /* SA Master Node               */
+       dump_sa_attrs,          /* SA attribute registration    */
+       dump_sa_layouts,        /* SA attribute layouts         */
+       dump_zap,               /* DSL scrub translations       */
+       dump_none,              /* fake dedup BP                */
+       dump_zap,               /* deadlist                     */
+       dump_none,              /* deadlist hdr                 */
+       dump_zap,               /* dsl clones                   */
+       dump_none,              /* bpobj subobjs                */
+       dump_unknown,           /* Unknown type, must be last   */
 };
 
 static void
@@ -1062,18 +1442,20 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
        dnode_t *dn;
        void *bonus = NULL;
        size_t bsize = 0;
-       char iblk[6], dblk[6], lsize[6], asize[6], bonus_size[6], segsize[6];
+       char iblk[32], dblk[32], lsize[32], asize[32], fill[32];
+       char bonus_size[32];
        char aux[50];
        int error;
 
        if (*print_header) {
-               (void) printf("\n    Object  lvl   iblk   dblk  lsize"
-                   "  asize  type\n");
+               (void) printf("\n%10s  %3s  %5s  %5s  %5s  %5s  %6s  %s\n",
+                   "Object", "lvl", "iblk", "dblk", "dsize", "lsize",
+                   "%full", "type");
                *print_header = 0;
        }
 
        if (object == 0) {
-               dn = os->os->os_meta_dnode;
+               dn = os->os_meta_dnode;
        } else {
                error = dmu_bonus_hold(os, object, FTAG, &db);
                if (error)
@@ -1085,46 +1467,51 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
        }
        dmu_object_info_from_dnode(dn, &doi);
 
-       nicenum(doi.doi_metadata_block_size, iblk);
-       nicenum(doi.doi_data_block_size, dblk);
-       nicenum(doi.doi_data_block_size * (doi.doi_max_block_offset + 1),
-           lsize);
-       nicenum(doi.doi_physical_blks << 9, asize);
-       nicenum(doi.doi_bonus_size, bonus_size);
+       zdb_nicenum(doi.doi_metadata_block_size, iblk);
+       zdb_nicenum(doi.doi_data_block_size, dblk);
+       zdb_nicenum(doi.doi_max_offset, lsize);
+       zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize);
+       zdb_nicenum(doi.doi_bonus_size, bonus_size);
+       (void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count *
+           doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) /
+           doi.doi_max_offset);
 
        aux[0] = '\0';
 
        if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
                (void) snprintf(aux + strlen(aux), sizeof (aux), " (K=%s)",
-                   zio_checksum_table[doi.doi_checksum].ci_name);
+                   ZDB_CHECKSUM_NAME(doi.doi_checksum));
        }
 
        if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
                (void) snprintf(aux + strlen(aux), sizeof (aux), " (Z=%s)",
-                   zio_compress_table[doi.doi_compress].ci_name);
+                   ZDB_COMPRESS_NAME(doi.doi_compress));
        }
 
-       (void) printf("%10lld  %3u  %5s  %5s  %5s  %5s  %s%s\n",
-           (u_longlong_t)object, doi.doi_indirection, iblk, dblk, lsize,
-           asize, dmu_ot[doi.doi_type].ot_name, aux);
+       (void) printf("%10lld  %3u  %5s  %5s  %5s  %5s  %6s  %s%s\n",
+           (u_longlong_t)object, doi.doi_indirection, iblk, dblk,
+           asize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux);
 
        if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
-               (void) printf("%10s  %3s  %5s  %5s  %5s  %5s  %s\n",
-                   "", "", "", "", bonus_size, "bonus",
-                   dmu_ot[doi.doi_bonus_type].ot_name);
+               (void) printf("%10s  %3s  %5s  %5s  %5s  %5s  %6s  %s\n",
+                   "", "", "", "", "", bonus_size, "bonus",
+                   ZDB_OT_NAME(doi.doi_bonus_type));
        }
 
        if (verbosity >= 4) {
-               (void) printf("\tdnode flags: %s%s\n",
+               (void) printf("\tdnode flags: %s%s%s\n",
                    (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
                    "USED_BYTES " : "",
                    (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
-                   "USERUSED_ACCOUNTED " : "");
+                   "USERUSED_ACCOUNTED " : "",
+                   (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
+                   "SPILL_BLKPTR" : "");
                (void) printf("\tdnode maxblkid: %llu\n",
                    (longlong_t)dn->dn_phys->dn_maxblkid);
 
-               object_viewer[doi.doi_bonus_type](os, object, bonus, bsize);
-               object_viewer[doi.doi_type](os, object, NULL, 0);
+               object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object,
+                   bonus, bsize);
+               object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0);
                *print_header = 1;
        }
 
@@ -1146,6 +1533,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
                }
 
                for (;;) {
+                       char segsize[32];
                        error = dnode_next_offset(dn,
                            0, &start, minlvl, blkfill, 0);
                        if (error)
@@ -1153,7 +1541,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
                        end = start;
                        error = dnode_next_offset(dn,
                            DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
-                       nicenum(end - start, segsize);
+                       zdb_nicenum(end - start, segsize);
                        (void) printf("\t\tsegment [%016llx, %016llx)"
                            " size %5s\n", (u_longlong_t)start,
                            (u_longlong_t)end, segsize);
@@ -1176,7 +1564,7 @@ dump_dir(objset_t *os)
        dmu_objset_stats_t dds;
        uint64_t object, object_count;
        uint64_t refdbytes, usedobjs, scratch;
-       char numbuf[8];
+       char numbuf[32];
        char blkbuf[BP_SPRINTF_LEN + 20];
        char osname[MAXNAMELEN];
        char *type = "UNKNOWN";
@@ -1191,21 +1579,20 @@ dump_dir(objset_t *os)
 
        if (dds.dds_type == DMU_OST_META) {
                dds.dds_creation_txg = TXG_INITIAL;
-               usedobjs = os->os->os_rootbp->blk_fill;
-               refdbytes = os->os->os_spa->spa_dsl_pool->
+               usedobjs = os->os_rootbp->blk_fill;
+               refdbytes = os->os_spa->spa_dsl_pool->
                    dp_mos_dir->dd_phys->dd_used_bytes;
        } else {
                dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
        }
 
-       ASSERT3U(usedobjs, ==, os->os->os_rootbp->blk_fill);
+       ASSERT3U(usedobjs, ==, os->os_rootbp->blk_fill);
 
-       nicenum(refdbytes, numbuf);
+       zdb_nicenum(refdbytes, numbuf);
 
        if (verbosity >= 4) {
-               (void) sprintf(blkbuf + strlen(blkbuf), ", rootbp ");
-               (void) sprintf_blkptr(blkbuf + strlen(blkbuf),
-                   BP_SPRINTF_LEN - strlen(blkbuf), os->os->os_rootbp);
+               (void) sprintf(blkbuf, ", rootbp ");
+               (void) sprintf_blkptr(blkbuf + strlen(blkbuf), os->os_rootbp);
        } else {
                blkbuf[0] = '\0';
        }
@@ -1218,18 +1605,6 @@ dump_dir(objset_t *os)
            (u_longlong_t)dds.dds_creation_txg,
            numbuf, (u_longlong_t)usedobjs, blkbuf);
 
-       dump_intent_log(dmu_objset_zil(os));
-
-       if (dmu_objset_ds(os) != NULL)
-               dump_bplist(dmu_objset_pool(os)->dp_meta_objset,
-                   dmu_objset_ds(os)->ds_phys->ds_deadlist_obj, "Deadlist");
-
-       if (verbosity < 2)
-               return;
-
-       if (os->os->os_rootbp->blk_birth == 0)
-               return;
-
        if (zopt_objects != 0) {
                for (i = 0; i < zopt_objects; i++)
                        dump_object(os, zopt_object[i], verbosity,
@@ -1238,10 +1613,22 @@ dump_dir(objset_t *os)
                return;
        }
 
+       if (dump_opt['i'] != 0 || verbosity >= 2)
+               dump_intent_log(dmu_objset_zil(os));
+
+       if (dmu_objset_ds(os) != NULL)
+               dump_deadlist(&dmu_objset_ds(os)->ds_deadlist);
+
+       if (verbosity < 2)
+               return;
+
+       if (os->os_rootbp->blk_birth == 0)
+               return;
+
        dump_object(os, 0, verbosity, &print_header);
        object_count = 0;
-       if (os->os->os_userused_dnode &&
-           os->os->os_userused_dnode->dn_type != 0) {
+       if (os->os_userused_dnode &&
+           os->os_userused_dnode->dn_type != 0) {
                dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
                dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
        }
@@ -1263,11 +1650,11 @@ dump_dir(objset_t *os)
 }
 
 static void
-dump_uberblock(uberblock_t *ub)
+dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
 {
        time_t timestamp = ub->ub_timestamp;
 
-       (void) printf("Uberblock\n\n");
+       (void) printf(header ? header : "");
        (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
        (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
        (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
@@ -1276,28 +1663,37 @@ dump_uberblock(uberblock_t *ub)
            (u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp)));
        if (dump_opt['u'] >= 3) {
                char blkbuf[BP_SPRINTF_LEN];
-               sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &ub->ub_rootbp);
+               sprintf_blkptr(blkbuf, &ub->ub_rootbp);
                (void) printf("\trootbp = %s\n", blkbuf);
        }
-       (void) printf("\n");
+       (void) printf(footer ? footer : "");
 }
 
 static void
-dump_config(const char *pool)
+dump_config(spa_t *spa)
 {
-       spa_t *spa = NULL;
+       dmu_buf_t *db;
+       size_t nvsize = 0;
+       int error = 0;
 
-       mutex_enter(&spa_namespace_lock);
-       while ((spa = spa_next(spa)) != NULL) {
-               if (pool == NULL)
-                       (void) printf("%s\n", spa_name(spa));
-               if (pool == NULL || strcmp(pool, spa_name(spa)) == 0)
-                       dump_nvlist(spa->spa_config, 4);
-       }
-       mutex_exit(&spa_namespace_lock);
-}
 
-static void
+       error = dmu_bonus_hold(spa->spa_meta_objset,
+           spa->spa_config_object, FTAG, &db);
+
+       if (error == 0) {
+               nvsize = *(uint64_t *)db->db_data;
+               dmu_buf_rele(db, FTAG);
+
+               (void) printf("\nMOS Configuration:\n");
+               dump_packed_nvlist(spa->spa_meta_objset,
+                   spa->spa_config_object, (void *)&nvsize, 1);
+       } else {
+               (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d",
+                   (u_longlong_t)spa->spa_config_object, error);
+       }
+}
+
+static void
 dump_cachefile(const char *cachefile)
 {
        int fd;
@@ -1343,33 +1739,75 @@ dump_cachefile(const char *cachefile)
        nvlist_free(config);
 }
 
+#define        ZDB_MAX_UB_HEADER_SIZE 32
+
+static void
+dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift)
+{
+       vdev_t vd;
+       vdev_t *vdp = &vd;
+       char header[ZDB_MAX_UB_HEADER_SIZE];
+
+       vd.vdev_ashift = ashift;
+       vdp->vdev_top = vdp;
+
+       for (int i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) {
+               uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, i);
+               uberblock_t *ub = (void *)((char *)lbl + uoff);
+
+               if (uberblock_verify(ub))
+                       continue;
+               (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE,
+                   "Uberblock[%d]\n", i);
+               dump_uberblock(ub, header, "");
+       }
+}
+
 static void
 dump_label(const char *dev)
 {
        int fd;
        vdev_label_t label;
-       char *buf = label.vl_vdev_phys.vp_nvlist;
+       char *path, *buf = label.vl_vdev_phys.vp_nvlist;
        size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist);
        struct stat64 statbuf;
-       uint64_t psize;
-       int l;
+       uint64_t psize, ashift;
+       int len = strlen(dev) + 1;
 
-       if ((fd = open64(dev, O_RDONLY)) < 0) {
-               (void) printf("cannot open '%s': %s\n", dev, strerror(errno));
+       if (strncmp(dev, "/dev/dsk/", 9) == 0) {
+               len++;
+               path = malloc(len);
+               (void) snprintf(path, len, "%s%s", "/dev/rdsk/", dev + 9);
+       } else {
+               path = strdup(dev);
+       }
+
+       if ((fd = open64(path, O_RDONLY)) < 0) {
+               (void) printf("cannot open '%s': %s\n", path, strerror(errno));
+               free(path);
                exit(1);
        }
 
        if (fstat64(fd, &statbuf) != 0) {
-               (void) printf("failed to stat '%s': %s\n", dev,
+               (void) printf("failed to stat '%s': %s\n", path,
                    strerror(errno));
+               free(path);
+               (void) close(fd);
+               exit(1);
+       }
+
+       if (S_ISBLK(statbuf.st_mode)) {
+               (void) printf("cannot use '%s': character device required\n",
+                   path);
+               free(path);
+               (void) close(fd);
                exit(1);
        }
 
        psize = statbuf.st_size;
        psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
 
-       for (l = 0; l < VDEV_LABELS; l++) {
-
+       for (int l = 0; l < VDEV_LABELS; l++) {
                nvlist_t *config = NULL;
 
                (void) printf("--------------------------------------------\n");
@@ -1384,105 +1822,47 @@ dump_label(const char *dev)
 
                if (nvlist_unpack(buf, buflen, &config, 0) != 0) {
                        (void) printf("failed to unpack label %d\n", l);
-                       continue;
+                       ashift = SPA_MINBLOCKSHIFT;
+               } else {
+                       nvlist_t *vdev_tree = NULL;
+
+                       dump_nvlist(config, 4);
+                       if ((nvlist_lookup_nvlist(config,
+                           ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||
+                           (nvlist_lookup_uint64(vdev_tree,
+                           ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
+                               ashift = SPA_MINBLOCKSHIFT;
+                       nvlist_free(config);
                }
-               dump_nvlist(config, 4);
-               nvlist_free(config);
+               if (dump_opt['u'])
+                       dump_label_uberblocks(&label, ashift);
        }
+
+       free(path);
+       (void) close(fd);
 }
 
 /*ARGSUSED*/
 static int
-dump_one_dir(char *dsname, void *arg)
+dump_one_dir(const char *dsname, void *arg)
 {
        int error;
        objset_t *os;
 
-       error = dmu_objset_open(dsname, DMU_OST_ANY,
-           DS_MODE_USER | DS_MODE_READONLY, &os);
+       error = dmu_objset_own(dsname, DMU_OST_ANY, B_TRUE, FTAG, &os);
        if (error) {
-               (void) printf("Could not open %s\n", dsname);
+               (void) printf("Could not open %s, error %d\n", dsname, error);
                return (0);
        }
        dump_dir(os);
-       dmu_objset_close(os);
+       dmu_objset_disown(os, FTAG);
        fuid_table_destroy();
+       sa_loaded = B_FALSE;
        return (0);
 }
 
-static void
-zdb_leak(space_map_t *sm, uint64_t start, uint64_t size)
-{
-       vdev_t *vd = sm->sm_ppd;
-
-       (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
-           (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
-}
-
-/* ARGSUSED */
-static void
-zdb_space_map_load(space_map_t *sm)
-{
-}
-
-static void
-zdb_space_map_unload(space_map_t *sm)
-{
-       space_map_vacate(sm, zdb_leak, sm);
-}
-
-/* ARGSUSED */
-static void
-zdb_space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
-{
-}
-
-static space_map_ops_t zdb_space_map_ops = {
-       zdb_space_map_load,
-       zdb_space_map_unload,
-       NULL,   /* alloc */
-       zdb_space_map_claim,
-       NULL,   /* free */
-       NULL    /* maxsize */
-};
-
-static void
-zdb_leak_init(spa_t *spa)
-{
-       vdev_t *rvd = spa->spa_root_vdev;
-
-       for (int c = 0; c < rvd->vdev_children; c++) {
-               vdev_t *vd = rvd->vdev_child[c];
-               for (int m = 0; m < vd->vdev_ms_count; m++) {
-                       metaslab_t *msp = vd->vdev_ms[m];
-                       mutex_enter(&msp->ms_lock);
-                       VERIFY(space_map_load(&msp->ms_map, &zdb_space_map_ops,
-                           SM_ALLOC, &msp->ms_smo, spa->spa_meta_objset) == 0);
-                       msp->ms_map.sm_ppd = vd;
-                       mutex_exit(&msp->ms_lock);
-               }
-       }
-}
-
-static void
-zdb_leak_fini(spa_t *spa)
-{
-       vdev_t *rvd = spa->spa_root_vdev;
-
-       for (int c = 0; c < rvd->vdev_children; c++) {
-               vdev_t *vd = rvd->vdev_child[c];
-               for (int m = 0; m < vd->vdev_ms_count; m++) {
-                       metaslab_t *msp = vd->vdev_ms[m];
-                       mutex_enter(&msp->ms_lock);
-                       space_map_unload(&msp->ms_map);
-                       mutex_exit(&msp->ms_lock);
-               }
-       }
-}
-
 /*
- * Verify that the sum of the sizes of all blocks in the pool adds up
- * to the SPA's sa_alloc total.
+ * Block statistics.
  */
 typedef struct zdb_blkstats {
        uint64_t        zb_asize;
@@ -1491,24 +1871,45 @@ typedef struct zdb_blkstats {
        uint64_t        zb_count;
 } zdb_blkstats_t;
 
-#define        DMU_OT_DEFERRED DMU_OT_NONE
-#define        DMU_OT_TOTAL    DMU_OT_NUMTYPES
+/*
+ * Extended object types to report deferred frees and dedup auto-ditto blocks.
+ */
+#define        ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0)
+#define        ZDB_OT_DITTO    (DMU_OT_NUMTYPES + 1)
+#define        ZDB_OT_TOTAL    (DMU_OT_NUMTYPES + 2)
+
+static char *zdb_ot_extname[] = {
+       "deferred free",
+       "dedup ditto",
+       "Total",
+};
 
 #define        ZB_TOTAL        DN_MAX_LEVELS
 
 typedef struct zdb_cb {
-       zdb_blkstats_t  zcb_type[ZB_TOTAL + 1][DMU_OT_TOTAL + 1];
+       zdb_blkstats_t  zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
+       uint64_t        zcb_dedup_asize;
+       uint64_t        zcb_dedup_blocks;
        uint64_t        zcb_errors[256];
        int             zcb_readfails;
        int             zcb_haderrors;
+       spa_t           *zcb_spa;
 } zdb_cb_t;
 
 static void
-zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, dmu_object_type_t type)
+zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
+    dmu_object_type_t type)
 {
+       uint64_t refcnt = 0;
+
+       ASSERT(type < ZDB_OT_TOTAL);
+
+       if (zilog && zil_bp_tree_add(zilog, bp) != 0)
+               return;
+
        for (int i = 0; i < 4; i++) {
                int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
-               int t = (i & 1) ? type : DMU_OT_TOTAL;
+               int t = (i & 1) ? type : ZDB_OT_TOTAL;
                zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
 
                zb->zb_asize += BP_GET_ASIZE(bp);
@@ -1517,127 +1918,258 @@ zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, dmu_object_type_t type)
                zb->zb_count++;
        }
 
-       if (dump_opt['S']) {
-               boolean_t print_sig;
-
-               print_sig = !zdb_sig_user_data || (BP_GET_LEVEL(bp) == 0 &&
-                   BP_GET_TYPE(bp) == DMU_OT_PLAIN_FILE_CONTENTS);
-
-               if (BP_GET_CHECKSUM(bp) < zdb_sig_cksumalg)
-                       print_sig = B_FALSE;
-
-               if (print_sig) {
-                       (void) printf("%llu\t%lld\t%lld\t%s\t%s\t%s\t"
-                           "%llx:%llx:%llx:%llx\n",
-                           (u_longlong_t)BP_GET_LEVEL(bp),
-                           (longlong_t)BP_GET_PSIZE(bp),
-                           (longlong_t)BP_GET_NDVAS(bp),
-                           dmu_ot[BP_GET_TYPE(bp)].ot_name,
-                           zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
-                           zio_compress_table[BP_GET_COMPRESS(bp)].ci_name,
-                           (u_longlong_t)bp->blk_cksum.zc_word[0],
-                           (u_longlong_t)bp->blk_cksum.zc_word[1],
-                           (u_longlong_t)bp->blk_cksum.zc_word[2],
-                           (u_longlong_t)bp->blk_cksum.zc_word[3]);
+       if (dump_opt['L'])
+               return;
+
+       if (BP_GET_DEDUP(bp)) {
+               ddt_t *ddt;
+               ddt_entry_t *dde;
+
+               ddt = ddt_select(zcb->zcb_spa, bp);
+               ddt_enter(ddt);
+               dde = ddt_lookup(ddt, bp, B_FALSE);
+
+               if (dde == NULL) {
+                       refcnt = 0;
+               } else {
+                       ddt_phys_t *ddp = ddt_phys_select(dde, bp);
+                       ddt_phys_decref(ddp);
+                       refcnt = ddp->ddp_refcnt;
+                       if (ddt_phys_total_refcnt(dde) == 0)
+                               ddt_remove(ddt, dde);
                }
+               ddt_exit(ddt);
        }
 
-       if (!dump_opt['L'])
-               VERIFY(zio_wait(zio_claim(NULL, spa, spa_first_txg(spa), bp,
-                   NULL, NULL, ZIO_FLAG_MUSTSUCCEED)) == 0);
+       VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
+           refcnt ? 0 : spa_first_txg(zcb->zcb_spa),
+           bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
 }
 
+/* ARGSUSED */
 static int
-zdb_blkptr_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
-    const dnode_phys_t *dnp, void *arg)
+zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
+    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
 {
        zdb_cb_t *zcb = arg;
        char blkbuf[BP_SPRINTF_LEN];
        dmu_object_type_t type;
-       boolean_t is_l0_metadata;
+       boolean_t is_metadata;
 
        if (bp == NULL)
                return (0);
 
        type = BP_GET_TYPE(bp);
 
-       zdb_count_block(spa, zcb, bp, type);
+       zdb_count_block(zcb, zilog, bp, type);
 
-       /*
-        * if we do metadata-only checksumming there's no need to checksum
-        * indirect blocks here because it is done during traverse
-        */
-       is_l0_metadata = (BP_GET_LEVEL(bp) == 0 && type < DMU_OT_NUMTYPES &&
-           dmu_ot[type].ot_metadata);
+       is_metadata = (BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata);
+
+       if (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata)) {
+               int ioerr;
+               size_t size = BP_GET_PSIZE(bp);
+               void *data = malloc(size);
+               int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
 
-       if (dump_opt['c'] > 1 || dump_opt['S'] ||
-           (dump_opt['c'] && is_l0_metadata)) {
-               int ioerr, size;
-               void *data;
+               /* If it's an intent log block, failure is expected. */
+               if (zb->zb_level == ZB_ZIL_LEVEL)
+                       flags |= ZIO_FLAG_SPECULATIVE;
 
-               size = BP_GET_LSIZE(bp);
-               data = malloc(size);
                ioerr = zio_wait(zio_read(NULL, spa, bp, data, size,
-                   NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
-                   ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB, zb));
+                   NULL, NULL, ZIO_PRIORITY_ASYNC_READ, flags, zb));
+
                free(data);
 
-               /* We expect io errors on intent log */
-               if (ioerr && type != DMU_OT_INTENT_LOG) {
+               if (ioerr && !(flags & ZIO_FLAG_SPECULATIVE)) {
                        zcb->zcb_haderrors = 1;
                        zcb->zcb_errors[ioerr]++;
 
                        if (dump_opt['b'] >= 2)
-                               sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp);
+                               sprintf_blkptr(blkbuf, bp);
                        else
                                blkbuf[0] = '\0';
 
-                       if (!dump_opt['S']) {
-                               (void) printf("zdb_blkptr_cb: "
-                                   "Got error %d reading "
-                                   "<%llu, %llu, %lld, %llx> %s -- skipping\n",
-                                   ioerr,
-                                   (u_longlong_t)zb->zb_objset,
-                                   (u_longlong_t)zb->zb_object,
-                                   (u_longlong_t)zb->zb_level,
-                                   (u_longlong_t)zb->zb_blkid,
-                                   blkbuf);
-                       }
+                       (void) printf("zdb_blkptr_cb: "
+                           "Got error %d reading "
+                           "<%llu, %llu, %lld, %llx> %s -- skipping\n",
+                           ioerr,
+                           (u_longlong_t)zb->zb_objset,
+                           (u_longlong_t)zb->zb_object,
+                           (u_longlong_t)zb->zb_level,
+                           (u_longlong_t)zb->zb_blkid,
+                           blkbuf);
                }
        }
 
        zcb->zcb_readfails = 0;
 
        if (dump_opt['b'] >= 4) {
-               sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp);
-               (void) printf("objset %llu object %llu offset 0x%llx %s\n",
+               sprintf_blkptr(blkbuf, bp);
+               (void) printf("objset %llu object %llu "
+                   "level %lld offset 0x%llx %s\n",
                    (u_longlong_t)zb->zb_objset,
                    (u_longlong_t)zb->zb_object,
-                   (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid),
+                   (longlong_t)zb->zb_level,
+                   (u_longlong_t)blkid2offset(dnp, bp, zb),
                    blkbuf);
        }
 
        return (0);
 }
 
+static void
+zdb_leak(space_map_t *sm, uint64_t start, uint64_t size)
+{
+       vdev_t *vd = sm->sm_ppd;
+
+       (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
+           (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
+}
+
+/* ARGSUSED */
+static void
+zdb_space_map_load(space_map_t *sm)
+{
+}
+
+static void
+zdb_space_map_unload(space_map_t *sm)
+{
+       space_map_vacate(sm, zdb_leak, sm);
+}
+
+/* ARGSUSED */
+static void
+zdb_space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
+{
+}
+
+static space_map_ops_t zdb_space_map_ops = {
+       zdb_space_map_load,
+       zdb_space_map_unload,
+       NULL,   /* alloc */
+       zdb_space_map_claim,
+       NULL,   /* free */
+       NULL    /* maxsize */
+};
+
+static void
+zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
+{
+       ddt_bookmark_t ddb = { 0 };
+       ddt_entry_t dde;
+       int error;
+
+       while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
+               blkptr_t blk;
+               ddt_phys_t *ddp = dde.dde_phys;
+
+               if (ddb.ddb_class == DDT_CLASS_UNIQUE)
+                       return;
+
+               ASSERT(ddt_phys_total_refcnt(&dde) > 1);
+
+               for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+                       if (ddp->ddp_phys_birth == 0)
+                               continue;
+                       ddt_bp_create(ddb.ddb_checksum,
+                           &dde.dde_key, ddp, &blk);
+                       if (p == DDT_PHYS_DITTO) {
+                               zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
+                       } else {
+                               zcb->zcb_dedup_asize +=
+                                   BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
+                               zcb->zcb_dedup_blocks++;
+                       }
+               }
+               if (!dump_opt['L']) {
+                       ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
+                       ddt_enter(ddt);
+                       VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
+                       ddt_exit(ddt);
+               }
+       }
+
+       ASSERT(error == ENOENT);
+}
+
+static void
+zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
+{
+       zcb->zcb_spa = spa;
+
+       if (!dump_opt['L']) {
+               vdev_t *rvd = spa->spa_root_vdev;
+               for (int c = 0; c < rvd->vdev_children; c++) {
+                       vdev_t *vd = rvd->vdev_child[c];
+                       for (int m = 0; m < vd->vdev_ms_count; m++) {
+                               metaslab_t *msp = vd->vdev_ms[m];
+                               mutex_enter(&msp->ms_lock);
+                               space_map_unload(&msp->ms_map);
+                               VERIFY(space_map_load(&msp->ms_map,
+                                   &zdb_space_map_ops, SM_ALLOC, &msp->ms_smo,
+                                   spa->spa_meta_objset) == 0);
+                               msp->ms_map.sm_ppd = vd;
+                               mutex_exit(&msp->ms_lock);
+                       }
+               }
+       }
+
+       spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+       zdb_ddt_leak_init(spa, zcb);
+
+       spa_config_exit(spa, SCL_CONFIG, FTAG);
+}
+
+static void
+zdb_leak_fini(spa_t *spa)
+{
+       if (!dump_opt['L']) {
+               vdev_t *rvd = spa->spa_root_vdev;
+               for (int c = 0; c < rvd->vdev_children; c++) {
+                       vdev_t *vd = rvd->vdev_child[c];
+                       for (int m = 0; m < vd->vdev_ms_count; m++) {
+                               metaslab_t *msp = vd->vdev_ms[m];
+                               mutex_enter(&msp->ms_lock);
+                               space_map_unload(&msp->ms_map);
+                               mutex_exit(&msp->ms_lock);
+                       }
+               }
+       }
+}
+
+/* ARGSUSED */
+static int
+count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+       zdb_cb_t *zcb = arg;
+
+       if (dump_opt['b'] >= 4) {
+               char blkbuf[BP_SPRINTF_LEN];
+               sprintf_blkptr(blkbuf, bp);
+               (void) printf("[%s] %s\n",
+                   "deferred free", blkbuf);
+       }
+       zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
+       return (0);
+}
+
 static int
 dump_block_stats(spa_t *spa)
 {
        zdb_cb_t zcb = { 0 };
        zdb_blkstats_t *zb, *tzb;
-       uint64_t alloc, space, logalloc;
-       vdev_t *rvd = spa->spa_root_vdev;
+       uint64_t norm_alloc, norm_space, total_alloc, total_found;
+       int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD;
        int leaks = 0;
-       int c, e;
 
-       if (!dump_opt['S']) {
-               (void) printf("\nTraversing all blocks %s%s%s%s%s...\n",
-                   (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
-                   (dump_opt['c'] == 1) ? "metadata " : "",
-                   dump_opt['c'] ? "checksums " : "",
-                   (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
-                   !dump_opt['L'] ? "nothing leaked " : "");
-       }
+       (void) printf("\nTraversing all blocks %s%s%s%s%s...\n",
+           (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
+           (dump_opt['c'] == 1) ? "metadata " : "",
+           dump_opt['c'] ? "checksums " : "",
+           (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
+           !dump_opt['L'] ? "nothing leaked " : "");
 
        /*
         * Load all space maps as SM_ALLOC maps, then traverse the pool
@@ -1647,39 +2179,25 @@ dump_block_stats(spa_t *spa)
         * it's not part of any space map) is a double allocation,
         * reference to a freed block, or an unclaimed log block.
         */
-       if (!dump_opt['L'])
-               zdb_leak_init(spa);
+       zdb_leak_init(spa, &zcb);
 
        /*
         * If there's a deferred-free bplist, process that first.
         */
-       if (spa->spa_sync_bplist_obj != 0) {
-               bplist_t *bpl = &spa->spa_sync_bplist;
-               blkptr_t blk;
-               uint64_t itor = 0;
+       (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
+           count_block_cb, &zcb, NULL);
+       (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
+           count_block_cb, &zcb, NULL);
 
-               VERIFY(0 == bplist_open(bpl, spa->spa_meta_objset,
-                   spa->spa_sync_bplist_obj));
+       if (dump_opt['c'] > 1)
+               flags |= TRAVERSE_PREFETCH_DATA;
 
-               while (bplist_iterate(bpl, &itor, &blk) == 0) {
-                       if (dump_opt['b'] >= 4) {
-                               char blkbuf[BP_SPRINTF_LEN];
-                               sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &blk);
-                               (void) printf("[%s] %s\n",
-                                   "deferred free", blkbuf);
-                       }
-                       zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED);
-               }
+       zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
 
-               bplist_close(bpl);
-       }
-
-       zcb.zcb_haderrors |= traverse_pool(spa, zdb_blkptr_cb, &zcb);
-
-       if (zcb.zcb_haderrors && !dump_opt['S']) {
+       if (zcb.zcb_haderrors) {
                (void) printf("\nError counts:\n\n");
                (void) printf("\t%5s  %s\n", "errno", "count");
-               for (e = 0; e < 256; e++) {
+               for (int e = 0; e < 256; e++) {
                        if (zcb.zcb_errors[e] != 0) {
                                (void) printf("\t%5d  %llu\n",
                                    e, (u_longlong_t)zcb.zcb_errors[e]);
@@ -1690,43 +2208,27 @@ dump_block_stats(spa_t *spa)
        /*
         * Report any leaked segments.
         */
-       if (!dump_opt['L'])
-               zdb_leak_fini(spa);
+       zdb_leak_fini(spa);
 
-       /*
-        * If we're interested in printing out the blkptr signatures,
-        * return now as we don't print out anything else (including
-        * errors and leaks).
-        */
-       if (dump_opt['S'])
-               return (zcb.zcb_haderrors ? 3 : 0);
+       tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
 
-       alloc = spa_get_alloc(spa);
-       space = spa_get_space(spa);
+       norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+       norm_space = metaslab_class_get_space(spa_normal_class(spa));
 
-       /*
-        * Log blocks allocated from a separate log device don't count
-        * as part of the normal pool space; factor them in here.
-        */
-       logalloc = 0;
+       total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa));
+       total_found = tzb->zb_asize - zcb.zcb_dedup_asize;
 
-       for (c = 0; c < rvd->vdev_children; c++)
-               if (rvd->vdev_child[c]->vdev_islog)
-                       logalloc += rvd->vdev_child[c]->vdev_stat.vs_alloc;
-
-       tzb = &zcb.zcb_type[ZB_TOTAL][DMU_OT_TOTAL];
-
-       if (tzb->zb_asize == alloc + logalloc) {
+       if (total_found == total_alloc) {
                if (!dump_opt['L'])
                        (void) printf("\n\tNo leaks (block sum matches space"
                            " maps exactly)\n");
        } else {
                (void) printf("block traversal size %llu != alloc %llu "
                    "(%s %lld)\n",
-                   (u_longlong_t)tzb->zb_asize,
-                   (u_longlong_t)alloc + logalloc,
+                   (u_longlong_t)total_found,
+                   (u_longlong_t)total_alloc,
                    (dump_opt['L']) ? "unreachable" : "leaked",
-                   (longlong_t)(alloc + logalloc - tzb->zb_asize));
+                   (longlong_t)(total_alloc - total_found));
                leaks = 1;
        }
 
@@ -1736,33 +2238,41 @@ dump_block_stats(spa_t *spa)
        (void) printf("\n");
        (void) printf("\tbp count:      %10llu\n",
            (u_longlong_t)tzb->zb_count);
-       (void) printf("\tbp logical:    %10llu\t avg: %6llu\n",
+       (void) printf("\tbp logical:    %10llu      avg: %6llu\n",
            (u_longlong_t)tzb->zb_lsize,
            (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
-       (void) printf("\tbp physical:   %10llu\t avg:"
-           " %6llu\tcompression: %6.2f\n",
+       (void) printf("\tbp physical:   %10llu      avg:"
+           " %6llu     compression: %6.2f\n",
            (u_longlong_t)tzb->zb_psize,
            (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
            (double)tzb->zb_lsize / tzb->zb_psize);
-       (void) printf("\tbp allocated:  %10llu\t avg:"
-           " %6llu\tcompression: %6.2f\n",
+       (void) printf("\tbp allocated:  %10llu      avg:"
+           " %6llu     compression: %6.2f\n",
            (u_longlong_t)tzb->zb_asize,
            (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
            (double)tzb->zb_lsize / tzb->zb_asize);
-       (void) printf("\tSPA allocated: %10llu\tused: %5.2f%%\n",
-           (u_longlong_t)alloc, 100.0 * alloc / space);
+       (void) printf("\tbp deduped:    %10llu    ref>1:"
+           " %6llu   deduplication: %6.2f\n",
+           (u_longlong_t)zcb.zcb_dedup_asize,
+           (u_longlong_t)zcb.zcb_dedup_blocks,
+           (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0);
+       (void) printf("\tSPA allocated: %10llu     used: %5.2f%%\n",
+           (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
 
        if (dump_opt['b'] >= 2) {
                int l, t, level;
                (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
                    "\t  avg\t comp\t%%Total\tType\n");
 
-               for (t = 0; t <= DMU_OT_NUMTYPES; t++) {
-                       char csize[6], lsize[6], psize[6], asize[6], avg[6];
+               for (t = 0; t <= ZDB_OT_TOTAL; t++) {
+                       char csize[32], lsize[32], psize[32], asize[32];
+                       char avg[32];
                        char *typename;
 
-                       typename = t == DMU_OT_DEFERRED ? "deferred free" :
-                           t == DMU_OT_TOTAL ? "Total" : dmu_ot[t].ot_name;
+                       if (t < DMU_OT_NUMTYPES)
+                               typename = dmu_ot[t].ot_name;
+                       else
+                               typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];
 
                        if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
                                (void) printf("%6s\t%5s\t%5s\t%5s"
@@ -1792,11 +2302,11 @@ dump_block_stats(spa_t *spa)
                                    zcb.zcb_type[ZB_TOTAL][t].zb_asize)
                                        continue;
 
-                               nicenum(zb->zb_count, csize);
-                               nicenum(zb->zb_lsize, lsize);
-                               nicenum(zb->zb_psize, psize);
-                               nicenum(zb->zb_asize, asize);
-                               nicenum(zb->zb_asize / zb->zb_count, avg);
+                               zdb_nicenum(zb->zb_count, csize);
+                               zdb_nicenum(zb->zb_lsize, lsize);
+                               zdb_nicenum(zb->zb_psize, psize);
+                               zdb_nicenum(zb->zb_asize, asize);
+                               zdb_nicenum(zb->zb_asize / zb->zb_count, avg);
 
                                (void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
                                    "\t%5.2f\t%6.2f\t",
@@ -1824,36 +2334,157 @@ dump_block_stats(spa_t *spa)
        return (0);
 }
 
+typedef struct zdb_ddt_entry {
+       ddt_key_t       zdde_key;
+       uint64_t        zdde_ref_blocks;
+       uint64_t        zdde_ref_lsize;
+       uint64_t        zdde_ref_psize;
+       uint64_t        zdde_ref_dsize;
+       avl_node_t      zdde_node;
+} zdb_ddt_entry_t;
+
+/* ARGSUSED */
+static int
+zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+       avl_tree_t *t = arg;
+       avl_index_t where;
+       zdb_ddt_entry_t *zdde, zdde_search;
+
+       if (bp == NULL)
+               return (0);
+
+       if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
+               (void) printf("traversing objset %llu, %llu objects, "
+                   "%lu blocks so far\n",
+                   (u_longlong_t)zb->zb_objset,
+                   (u_longlong_t)bp->blk_fill,
+                   avl_numnodes(t));
+       }
+
+       if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
+           BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata)
+               return (0);
+
+       ddt_key_fill(&zdde_search.zdde_key, bp);
+
+       zdde = avl_find(t, &zdde_search, &where);
+
+       if (zdde == NULL) {
+               zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL);
+               zdde->zdde_key = zdde_search.zdde_key;
+               avl_insert(t, zdde, where);
+       }
+
+       zdde->zdde_ref_blocks += 1;
+       zdde->zdde_ref_lsize += BP_GET_LSIZE(bp);
+       zdde->zdde_ref_psize += BP_GET_PSIZE(bp);
+       zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
+
+       return (0);
+}
+
+static void
+dump_simulated_ddt(spa_t *spa)
+{
+       avl_tree_t t;
+       void *cookie = NULL;
+       zdb_ddt_entry_t *zdde;
+       ddt_histogram_t ddh_total = { 0 };
+       ddt_stat_t dds_total = { 0 };
+
+       avl_create(&t, ddt_entry_compare,
+           sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
+
+       spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+       (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
+           zdb_ddt_add_cb, &t);
+
+       spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+       while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
+               ddt_stat_t dds;
+               uint64_t refcnt = zdde->zdde_ref_blocks;
+               ASSERT(refcnt != 0);
+
+               dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
+               dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
+               dds.dds_psize = zdde->zdde_ref_psize / refcnt;
+               dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
+
+               dds.dds_ref_blocks = zdde->zdde_ref_blocks;
+               dds.dds_ref_lsize = zdde->zdde_ref_lsize;
+               dds.dds_ref_psize = zdde->zdde_ref_psize;
+               dds.dds_ref_dsize = zdde->zdde_ref_dsize;
+
+               ddt_stat_add(&ddh_total.ddh_stat[highbit(refcnt) - 1], &dds, 0);
+
+               umem_free(zdde, sizeof (*zdde));
+       }
+
+       avl_destroy(&t);
+
+       ddt_histogram_stat(&dds_total, &ddh_total);
+
+       (void) printf("Simulated DDT histogram:\n");
+
+       zpool_dump_ddt(&dds_total, &ddh_total);
+
+       dump_dedup_ratio(&dds_total);
+}
+
 static void
 dump_zpool(spa_t *spa)
 {
        dsl_pool_t *dp = spa_get_dsl(spa);
        int rc = 0;
 
+       if (dump_opt['S']) {
+               dump_simulated_ddt(spa);
+               return;
+       }
+
+       if (!dump_opt['e'] && dump_opt['C'] > 1) {
+               (void) printf("\nCached configuration:\n");
+               dump_nvlist(spa->spa_config, 8);
+       }
+
+       if (dump_opt['C'])
+               dump_config(spa);
+
        if (dump_opt['u'])
-               dump_uberblock(&spa->spa_uberblock);
+               dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");
 
-       if (dump_opt['d'] || dump_opt['i'] || dump_opt['m']) {
+       if (dump_opt['D'])
+               dump_all_ddts(spa);
+
+       if (dump_opt['d'] > 2 || dump_opt['m'])
+               dump_metaslabs(spa);
+
+       if (dump_opt['d'] || dump_opt['i']) {
                dump_dir(dp->dp_meta_objset);
                if (dump_opt['d'] >= 3) {
-                       dump_bplist(dp->dp_meta_objset,
-                           spa->spa_sync_bplist_obj, "Deferred frees");
+                       dump_bpobj(&spa->spa_deferred_bpobj, "Deferred frees");
+                       if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+                               dump_bpobj(&spa->spa_dsl_pool->dp_free_bpobj,
+                                   "Pool frees");
+                       }
                        dump_dtl(spa->spa_root_vdev, 0);
                }
-
-               if (dump_opt['d'] >= 3 || dump_opt['m'])
-                       dump_metaslabs(spa);
-
-               (void) dmu_objset_find(spa_name(spa), dump_one_dir, NULL,
-                   DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+               (void) dmu_objset_find(spa_name(spa), dump_one_dir,
+                   NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
        }
-
-       if (dump_opt['b'] || dump_opt['c'] || dump_opt['S'])
+       if (dump_opt['b'] || dump_opt['c'])
                rc = dump_block_stats(spa);
 
        if (dump_opt['s'])
                show_pool_stats(spa);
 
+       if (dump_opt['h'])
+               dump_history(spa);
+
        if (rc != 0)
                exit(rc);
 }
@@ -1872,51 +2503,13 @@ int flagbits[256];
 static void
 zdb_print_blkptr(blkptr_t *bp, int flags)
 {
-       dva_t *dva = bp->blk_dva;
-       int d;
+       char blkbuf[BP_SPRINTF_LEN];
 
        if (flags & ZDB_FLAG_BSWAP)
                byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
-       /*
-        * Super-ick warning:  This code is also duplicated in
-        * cmd/mdb/common/modules/zfs/zfs.c .  Yeah, I hate code
-        * replication, too.
-        */
-       for (d = 0; d < BP_GET_NDVAS(bp); d++) {
-               (void) printf("\tDVA[%d]: vdev_id %lld / %llx\n", d,
-                   (longlong_t)DVA_GET_VDEV(&dva[d]),
-                   (longlong_t)DVA_GET_OFFSET(&dva[d]));
-               (void) printf("\tDVA[%d]:       GANG: %-5s  GRID:  %04llx\t"
-                   "ASIZE: %llx\n", d,
-                   DVA_GET_GANG(&dva[d]) ? "TRUE" : "FALSE",
-                   (longlong_t)DVA_GET_GRID(&dva[d]),
-                   (longlong_t)DVA_GET_ASIZE(&dva[d]));
-               (void) printf("\tDVA[%d]: :%llu:%llx:%llx:%s%s%s%s\n", d,
-                   (u_longlong_t)DVA_GET_VDEV(&dva[d]),
-                   (longlong_t)DVA_GET_OFFSET(&dva[d]),
-                   (longlong_t)BP_GET_PSIZE(bp),
-                   BP_SHOULD_BYTESWAP(bp) ? "e" : "",
-                   !DVA_GET_GANG(&dva[d]) && BP_GET_LEVEL(bp) != 0 ?
-                   "d" : "",
-                   DVA_GET_GANG(&dva[d]) ? "g" : "",
-                   BP_GET_COMPRESS(bp) != 0 ? "d" : "");
-       }
-       (void) printf("\tLSIZE:  %-16llx\t\tPSIZE: %llx\n",
-           (longlong_t)BP_GET_LSIZE(bp), (longlong_t)BP_GET_PSIZE(bp));
-       (void) printf("\tENDIAN: %6s\t\t\t\t\tTYPE:  %s\n",
-           BP_GET_BYTEORDER(bp) ? "LITTLE" : "BIG",
-           dmu_ot[BP_GET_TYPE(bp)].ot_name);
-       (void) printf("\tBIRTH:  %-16llx   LEVEL: %-2llu\tFILL:  %llx\n",
-           (u_longlong_t)bp->blk_birth, (u_longlong_t)BP_GET_LEVEL(bp),
-           (u_longlong_t)bp->blk_fill);
-       (void) printf("\tCKFUNC: %-16s\t\tCOMP:  %s\n",
-           zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
-           zio_compress_table[BP_GET_COMPRESS(bp)].ci_name);
-       (void) printf("\tCKSUM:  %llx:%llx:%llx:%llx\n",
-           (u_longlong_t)bp->blk_cksum.zc_word[0],
-           (u_longlong_t)bp->blk_cksum.zc_word[1],
-           (u_longlong_t)bp->blk_cksum.zc_word[2],
-           (u_longlong_t)bp->blk_cksum.zc_word[3]);
+
+       sprintf_blkptr(blkbuf, bp);
+       (void) printf("%s\n", blkbuf);
 }
 
 static void
@@ -1939,7 +2532,7 @@ zdb_dump_block_raw(void *buf, uint64_t size, int flags)
 {
        if (flags & ZDB_FLAG_BSWAP)
                byteswap_uint64_array(buf, size);
-       (void) write(2, buf, size);
+       (void) write(1, buf, size);
 }
 
 static void
@@ -2042,31 +2635,30 @@ name:
  *     flags          - A string of characters specifying options
  *              b: Decode a blkptr at given offset within block
  *             *c: Calculate and display checksums
- *             *d: Decompress data before dumping
+ *              d: Decompress data before dumping
  *              e: Byteswap data before dumping
- *             *g: Display data as a gang block header
- *             *i: Display as an indirect block
+ *              g: Display data as a gang block header
+ *              i: Display as an indirect block
  *              p: Do I/O to physical offset
  *              r: Dump raw data to stdout
  *
  *              * = not yet implemented
  */
 static void
-zdb_read_block(char *thing, spa_t **spap)
+zdb_read_block(char *thing, spa_t *spa)
 {
-       spa_t *spa = *spap;
+       blkptr_t blk, *bp = &blk;
+       dva_t *dva = bp->blk_dva;
        int flags = 0;
-       uint64_t offset = 0, size = 0, blkptr_offset = 0;
+       uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
        zio_t *zio;
        vdev_t *vd;
-       void *buf;
-       char *s, *p, *dup, *pool, *vdev, *flagstr;
-       int i, error, zio_flags;
+       void *pbuf, *lbuf, *buf;
+       char *s, *p, *dup, *vdev, *flagstr;
+       int i, error;
 
        dup = strdup(thing);
        s = strtok(dup, ":");
-       pool = s ? s : "";
-       s = strtok(NULL, ":");
        vdev = s ? s : "";
        s = strtok(NULL, ":");
        offset = strtoull(s ? s : "", NULL, 16);
@@ -2100,7 +2692,7 @@ zdb_read_block(char *thing, spa_t **spap)
                        flags |= bit;
 
                        /* If it's not something with an argument, keep going */
-                       if ((bit & (ZDB_FLAG_CHECKSUM | ZDB_FLAG_DECOMPRESS |
+                       if ((bit & (ZDB_FLAG_CHECKSUM |
                            ZDB_FLAG_PRINT_BLKPTR)) == 0)
                                continue;
 
@@ -2115,16 +2707,6 @@ zdb_read_block(char *thing, spa_t **spap)
                }
        }
 
-       if (spa == NULL || strcmp(spa_name(spa), pool) != 0) {
-               if (spa)
-                       spa_close(spa, (void *)zdb_read_block);
-               error = spa_open(pool, spap, (void *)zdb_read_block);
-               if (error)
-                       fatal("Failed to open pool '%s': %s",
-                           pool, strerror(error));
-               spa = *spap;
-       }
-
        vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
        if (vd == NULL) {
                (void) printf("***Invalid vdev: %s\n", vdev);
@@ -2132,22 +2714,58 @@ zdb_read_block(char *thing, spa_t **spap)
                return;
        } else {
                if (vd->vdev_path)
-                       (void) printf("Found vdev: %s\n", vd->vdev_path);
+                       (void) fprintf(stderr, "Found vdev: %s\n",
+                           vd->vdev_path);
                else
-                       (void) printf("Found vdev type: %s\n",
+                       (void) fprintf(stderr, "Found vdev type: %s\n",
                            vd->vdev_ops->vdev_op_type);
        }
 
-       buf = umem_alloc(size, UMEM_NOFAIL);
+       psize = size;
+       lsize = size;
+
+       pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+       lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+
+       BP_ZERO(bp);
 
-       zio_flags = ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
-           ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY;
+       DVA_SET_VDEV(&dva[0], vd->vdev_id);
+       DVA_SET_OFFSET(&dva[0], offset);
+       DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
+       DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
+
+       BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
+
+       BP_SET_LSIZE(bp, lsize);
+       BP_SET_PSIZE(bp, psize);
+       BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+       BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
+       BP_SET_TYPE(bp, DMU_OT_NONE);
+       BP_SET_LEVEL(bp, 0);
+       BP_SET_DEDUP(bp, 0);
+       BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
 
        spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
        zio = zio_root(spa, NULL, NULL, 0);
-       /* XXX todo - cons up a BP so RAID-Z will be happy */
-       zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset, buf, size,
-           ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, zio_flags, NULL, NULL));
+
+       if (vd == vd->vdev_top) {
+               /*
+                * Treat this as a normal block read.
+                */
+               zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL,
+                   ZIO_PRIORITY_SYNC_READ,
+                   ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
+       } else {
+               /*
+                * Treat this as a vdev child I/O.
+                */
+               zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize,
+                   ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
+                   ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
+                   ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
+                   ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL));
+       }
+
        error = zio_wait(zio);
        spa_config_exit(spa, SCL_STATE, FTAG);
 
@@ -2156,6 +2774,52 @@ zdb_read_block(char *thing, spa_t **spap)
                goto out;
        }
 
+       if (flags & ZDB_FLAG_DECOMPRESS) {
+               /*
+                * We don't know how the data was compressed, so just try
+                * every decompress function at every inflated blocksize.
+                */
+               enum zio_compress c;
+               void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+               void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+
+               bcopy(pbuf, pbuf2, psize);
+
+               VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize,
+                   SPA_MAXBLOCKSIZE - psize) == 0);
+
+               VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
+                   SPA_MAXBLOCKSIZE - psize) == 0);
+
+               for (lsize = SPA_MAXBLOCKSIZE; lsize > psize;
+                   lsize -= SPA_MINBLOCKSIZE) {
+                       for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
+                               if (zio_decompress_data(c, pbuf, lbuf,
+                                   psize, lsize) == 0 &&
+                                   zio_decompress_data(c, pbuf2, lbuf2,
+                                   psize, lsize) == 0 &&
+                                   bcmp(lbuf, lbuf2, lsize) == 0)
+                                       break;
+                       }
+                       if (c != ZIO_COMPRESS_FUNCTIONS)
+                               break;
+                       lsize -= SPA_MINBLOCKSIZE;
+               }
+
+               umem_free(pbuf2, SPA_MAXBLOCKSIZE);
+               umem_free(lbuf2, SPA_MAXBLOCKSIZE);
+
+               if (lsize <= psize) {
+                       (void) printf("Decompress of %s failed\n", thing);
+                       goto out;
+               }
+               buf = lbuf;
+               size = lsize;
+       } else {
+               buf = pbuf;
+               size = psize;
+       }
+
        if (flags & ZDB_FLAG_PRINT_BLKPTR)
                zdb_print_blkptr((blkptr_t *)(void *)
                    ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
@@ -2170,134 +2834,92 @@ zdb_read_block(char *thing, spa_t **spap)
                zdb_dump_block(thing, buf, size, flags);
 
 out:
-       umem_free(buf, size);
+       umem_free(pbuf, SPA_MAXBLOCKSIZE);
+       umem_free(lbuf, SPA_MAXBLOCKSIZE);
        free(dup);
 }
 
 static boolean_t
-nvlist_string_match(nvlist_t *config, char *name, char *tgt)
+pool_match(nvlist_t *cfg, char *tgt)
 {
+       uint64_t v, guid = strtoull(tgt, NULL, 0);
        char *s;
 
-       if (nvlist_lookup_string(config, name, &s) != 0)
-               return (B_FALSE);
-
-       return (strcmp(s, tgt) == 0);
-}
-
-static boolean_t
-nvlist_uint64_match(nvlist_t *config, char *name, uint64_t tgt)
-{
-       uint64_t val;
-
-       if (nvlist_lookup_uint64(config, name, &val) != 0)
-               return (B_FALSE);
-
-       return (val == tgt);
-}
-
-static boolean_t
-vdev_child_guid_match(nvlist_t *vdev, uint64_t guid)
-{
-       nvlist_t **child;
-       uint_t c, children;
-
-       verify(nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN,
-           &child, &children) == 0);
-       for (c = 0; c < children; ++c)
-               if (nvlist_uint64_match(child[c], ZPOOL_CONFIG_GUID, guid))
-                       return (B_TRUE);
-       return (B_FALSE);
-}
-
-static boolean_t
-vdev_child_string_match(nvlist_t *vdev, char *tgt)
-{
-       nvlist_t **child;
-       uint_t c, children;
-
-       verify(nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN,
-           &child, &children) == 0);
-       for (c = 0; c < children; ++c) {
-               if (nvlist_string_match(child[c], ZPOOL_CONFIG_PATH, tgt) ||
-                   nvlist_string_match(child[c], ZPOOL_CONFIG_DEVID, tgt))
-                       return (B_TRUE);
-       }
-       return (B_FALSE);
-}
-
-static boolean_t
-vdev_guid_match(nvlist_t *config, uint64_t guid)
-{
-       nvlist_t *nvroot;
-
-       verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
-           &nvroot) == 0);
-
-       return (nvlist_uint64_match(nvroot, ZPOOL_CONFIG_GUID, guid) ||
-           vdev_child_guid_match(nvroot, guid));
-}
-
-static boolean_t
-vdev_string_match(nvlist_t *config, char *tgt)
-{
-       nvlist_t *nvroot;
-
-       verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
-           &nvroot) == 0);
-
-       return (vdev_child_string_match(nvroot, tgt));
-}
-
-static boolean_t
-pool_match(nvlist_t *config, char *tgt)
-{
-       uint64_t guid = strtoull(tgt, NULL, 0);
-
        if (guid != 0) {
-               return (
-                   nvlist_uint64_match(config, ZPOOL_CONFIG_POOL_GUID, guid) ||
-                   vdev_guid_match(config, guid));
+               if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0)
+                       return (v == guid);
        } else {
-               return (
-                   nvlist_string_match(config, ZPOOL_CONFIG_POOL_NAME, tgt) ||
-                   vdev_string_match(config, tgt));
+               if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0)
+                       return (strcmp(s, tgt) == 0);
        }
+       return (B_FALSE);
 }
 
-static int
-find_exported_zpool(char *pool_id, nvlist_t **configp, char *vdev_dir)
+static char *
+find_zpool(char **target, nvlist_t **configp, int dirc, char **dirv)
 {
        nvlist_t *pools;
-       int error = ENOENT;
        nvlist_t *match = NULL;
+       char *name = NULL;
+       char *sepp = NULL;
+       char sep;
+       int count = 0;
+       importargs_t args = { 0 };
 
-       if (vdev_dir != NULL)
-               pools = zpool_find_import_activeok(g_zfs, 1, &vdev_dir);
-       else
-               pools = zpool_find_import_activeok(g_zfs, 0, NULL);
+       args.paths = dirc;
+       args.path = dirv;
+       args.can_be_active = B_TRUE;
+
+       if ((sepp = strpbrk(*target, "/@")) != NULL) {
+               sep = *sepp;
+               *sepp = '\0';
+       }
+
+       pools = zpool_search_import(g_zfs, &args);
 
        if (pools != NULL) {
                nvpair_t *elem = NULL;
-
                while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
                        verify(nvpair_value_nvlist(elem, configp) == 0);
-                       if (pool_match(*configp, pool_id)) {
+                       if (pool_match(*configp, *target)) {
+                               count++;
                                if (match != NULL) {
-                                       (void) fatal(
-                                           "More than one matching pool - "
-                                           "specify guid/devid/device path.");
+                                       /* print previously found config */
+                                       if (name != NULL) {
+                                               (void) printf("%s\n", name);
+                                               dump_nvlist(match, 8);
+                                               name = NULL;
+                                       }
+                                       (void) printf("%s\n",
+                                           nvpair_name(elem));
+                                       dump_nvlist(*configp, 8);
                                } else {
                                        match = *configp;
-                                       error = 0;
+                                       name = nvpair_name(elem);
                                }
                        }
                }
        }
+       if (count > 1)
+               (void) fatal("\tMatched %d pools - use pool GUID "
+                   "instead of pool name or \n"
+                   "\tpool name part of a dataset name to select pool", count);
+
+       if (sepp)
+               *sepp = sep;
+       /*
+        * If pool GUID was specified for pool id, replace it with pool name
+        */
+       if (name && (strstr(*target, name) != *target)) {
+               int sz = 1 + strlen(name) + ((sepp) ? strlen(sepp) : 0);
+
+               *target = umem_alloc(sz, UMEM_NOFAIL);
+               (void) snprintf(*target, sz, "%s%s", name, sepp ? sepp : "");
+       }
 
-       *configp = error ? NULL : match;
+       *configp = name ? match : NULL;
 
-       return (error);
+       return (name);
 }
 
 int
@@ -2305,83 +2927,85 @@ main(int argc, char **argv)
 {
        int i, c;
        struct rlimit rl = { 1024, 1024 };
-       spa_t *spa;
+       spa_t *spa = NULL;
        objset_t *os = NULL;
-       char *endstr;
        int dump_all = 1;
        int verbose = 0;
-       int error;
-       int exported = 0;
-       char *vdev_dir = NULL;
+       int error = 0;
+       char **searchdirs = NULL;
+       int nsearch = 0;
+       char *target;
+       nvlist_t *policy = NULL;
+       uint64_t max_txg = UINT64_MAX;
+       int rewind = ZPOOL_NEVER_REWIND;
 
        (void) setrlimit(RLIMIT_NOFILE, &rl);
        (void) enable_extended_FILE_stdio(-1, -1);
 
        dprintf_setup(&argc, argv);
 
-       while ((c = getopt(argc, argv, "udibcmsvCLS:U:lRep:t:")) != -1) {
+       while ((c = getopt(argc, argv, "bcdhilmsuCDRSAFLXevp:t:U:P")) != -1) {
                switch (c) {
-               case 'u':
-               case 'd':
-               case 'i':
                case 'b':
                case 'c':
+               case 'd':
+               case 'h':
+               case 'i':
+               case 'l':
                case 'm':
                case 's':
+               case 'u':
                case 'C':
-               case 'l':
+               case 'D':
                case 'R':
+               case 'S':
                        dump_opt[c]++;
                        dump_all = 0;
                        break;
+               case 'A':
+               case 'F':
                case 'L':
+               case 'X':
+               case 'e':
+               case 'P':
                        dump_opt[c]++;
                        break;
                case 'v':
                        verbose++;
                        break;
-               case 'U':
-                       spa_config_path = optarg;
-                       break;
-               case 'e':
-                       exported = 1;
-                       break;
                case 'p':
-                       vdev_dir = optarg;
-                       break;
-               case 'S':
-                       dump_opt[c]++;
-                       dump_all = 0;
-                       zdb_sig_user_data = (strncmp(optarg, "user:", 5) == 0);
-                       if (!zdb_sig_user_data && strncmp(optarg, "all:", 4))
-                               usage();
-                       endstr = strchr(optarg, ':') + 1;
-                       if (strcmp(endstr, "fletcher2") == 0)
-                               zdb_sig_cksumalg = ZIO_CHECKSUM_FLETCHER_2;
-                       else if (strcmp(endstr, "fletcher4") == 0)
-                               zdb_sig_cksumalg = ZIO_CHECKSUM_FLETCHER_4;
-                       else if (strcmp(endstr, "sha256") == 0)
-                               zdb_sig_cksumalg = ZIO_CHECKSUM_SHA256;
-                       else if (strcmp(endstr, "all") == 0)
-                               zdb_sig_cksumalg = ZIO_CHECKSUM_FLETCHER_2;
-                       else
-                               usage();
+                       if (searchdirs == NULL) {
+                               searchdirs = umem_alloc(sizeof (char *),
+                                   UMEM_NOFAIL);
+                       } else {
+                               char **tmp = umem_alloc((nsearch + 1) *
+                                   sizeof (char *), UMEM_NOFAIL);
+                               bcopy(searchdirs, tmp, nsearch *
+                                   sizeof (char *));
+                               umem_free(searchdirs,
+                                   nsearch * sizeof (char *));
+                               searchdirs = tmp;
+                       }
+                       searchdirs[nsearch++] = optarg;
                        break;
                case 't':
-                       ub_max_txg = strtoull(optarg, NULL, 0);
-                       if (ub_max_txg < TXG_INITIAL) {
+                       max_txg = strtoull(optarg, NULL, 0);
+                       if (max_txg < TXG_INITIAL) {
                                (void) fprintf(stderr, "incorrect txg "
                                    "specified: %s\n", optarg);
                                usage();
                        }
                        break;
+               case 'U':
+                       spa_config_path = optarg;
+                       break;
                default:
                        usage();
                        break;
                }
        }
 
-       if (vdev_dir != NULL && exported == 0) {
+       if (!dump_opt['e'] && searchdirs != NULL) {
                (void) fprintf(stderr, "-p option requires use of -e\n");
                usage();
        }
@@ -2390,18 +3014,26 @@ main(int argc, char **argv)
        g_zfs = libzfs_init();
        ASSERT(g_zfs != NULL);
 
+       if (dump_all)
+               verbose = MAX(verbose, 1);
+
        for (c = 0; c < 256; c++) {
-               if (dump_all && c != 'l' && c != 'R')
+               if (dump_all && !strchr("elAFLRSXP", c))
                        dump_opt[c] = 1;
                if (dump_opt[c])
                        dump_opt[c] += verbose;
        }
 
+       aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2);
+       zfs_recover = (dump_opt['A'] > 1);
+
        argc -= optind;
        argv += optind;
 
+       if (argc < 2 && dump_opt['R'])
+               usage();
        if (argc < 1) {
-               if (dump_opt['C']) {
+               if (!dump_opt['e'] && dump_opt['C']) {
                        dump_cachefile(spa_config_path);
                        return (0);
                }
@@ -2413,99 +3045,104 @@ main(int argc, char **argv)
                return (0);
        }
 
-       if (dump_opt['R']) {
-               flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
-               flagbits['c'] = ZDB_FLAG_CHECKSUM;
-               flagbits['d'] = ZDB_FLAG_DECOMPRESS;
-               flagbits['e'] = ZDB_FLAG_BSWAP;
-               flagbits['g'] = ZDB_FLAG_GBH;
-               flagbits['i'] = ZDB_FLAG_INDIRECT;
-               flagbits['p'] = ZDB_FLAG_PHYS;
-               flagbits['r'] = ZDB_FLAG_RAW;
-
-               spa = NULL;
-               while (argv[0]) {
-                       zdb_read_block(argv[0], &spa);
-                       argv++;
-                       argc--;
-               }
-               if (spa)
-                       spa_close(spa, (void *)zdb_read_block);
-               return (0);
-       }
+       if (dump_opt['X'] || dump_opt['F'])
+               rewind = ZPOOL_DO_REWIND |
+                   (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
 
-       if (dump_opt['C'])
-               dump_config(argv[0]);
+       if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
+           nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, max_txg) != 0 ||
+           nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind) != 0)
+               fatal("internal error: %s", strerror(ENOMEM));
 
        error = 0;
-       if (exported) {
-               /*
-                * Check to see if the name refers to an exported zpool
-                */
-               char *slash;
-               nvlist_t *exported_conf = NULL;
-
-               if ((slash = strchr(argv[0], '/')) != NULL)
-                       *slash = '\0';
-
-               error = find_exported_zpool(argv[0], &exported_conf, vdev_dir);
-               if (error == 0) {
-                       nvlist_t *nvl = NULL;
-
-                       if (vdev_dir != NULL) {
-                               if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
-                                       error = ENOMEM;
-                               else if (nvlist_add_string(nvl,
-                                   zpool_prop_to_name(ZPOOL_PROP_ALTROOT),
-                                   vdev_dir) != 0)
-                                       error = ENOMEM;
-                       }
+       target = argv[0];
 
-                       if (error == 0)
-                               error = spa_import_verbatim(argv[0],
-                                   exported_conf, nvl);
+       if (dump_opt['e']) {
+               nvlist_t *cfg = NULL;
+               char *name = find_zpool(&target, &cfg, nsearch, searchdirs);
 
-                       nvlist_free(nvl);
+               error = ENOENT;
+               if (name) {
+                       if (dump_opt['C'] > 1) {
+                               (void) printf("\nConfiguration for import:\n");
+                               dump_nvlist(cfg, 8);
+                       }
+                       if (nvlist_add_nvlist(cfg,
+                           ZPOOL_REWIND_POLICY, policy) != 0) {
+                               fatal("can't open '%s': %s",
+                                   target, strerror(ENOMEM));
+                       }
+                       if ((error = spa_import(name, cfg, NULL)) != 0)
+                               error = spa_import_verbatim(name, cfg, NULL);
                }
-
-               if (slash != NULL)
-                       *slash = '/';
        }
 
        if (error == 0) {
-               if (strchr(argv[0], '/') != NULL) {
-                       error = dmu_objset_open(argv[0], DMU_OST_ANY,
-                           DS_MODE_USER | DS_MODE_READONLY, &os);
+               if (strpbrk(target, "/@") == NULL || dump_opt['R']) {
+                       error = spa_open_rewind(target, &spa, FTAG, policy,
+                           NULL);
+                       if (error) {
+                               /*
+                                * If we're missing the log device then
+                                * try opening the pool after clearing the
+                                * log state.
+                                */
+                               mutex_enter(&spa_namespace_lock);
+                               if ((spa = spa_lookup(target)) != NULL &&
+                                   spa->spa_log_state == SPA_LOG_MISSING) {
+                                       spa->spa_log_state = SPA_LOG_CLEAR;
+                                       error = 0;
+                               }
+                               mutex_exit(&spa_namespace_lock);
+
+                               if (!error) {
+                                       error = spa_open_rewind(target, &spa,
+                                           FTAG, policy, NULL);
+                               }
+                       }
                } else {
-                       error = spa_open(argv[0], &spa, FTAG);
+                       error = dmu_objset_own(target, DMU_OST_ANY,
+                           B_TRUE, FTAG, &os);
                }
        }
+       nvlist_free(policy);
 
        if (error)
-               fatal("can't open %s: %s", argv[0], strerror(error));
+               fatal("can't open '%s': %s", target, strerror(error));
 
        argv++;
-       if (--argc > 0) {
-               zopt_objects = argc;
-               zopt_object = calloc(zopt_objects, sizeof (uint64_t));
-               for (i = 0; i < zopt_objects; i++) {
-                       errno = 0;
-                       zopt_object[i] = strtoull(argv[i], NULL, 0);
-                       if (zopt_object[i] == 0 && errno != 0)
-                               fatal("bad object number %s: %s",
-                                   argv[i], strerror(errno));
+       argc--;
+       if (!dump_opt['R']) {
+               if (argc > 0) {
+                       zopt_objects = argc;
+                       zopt_object = calloc(zopt_objects, sizeof (uint64_t));
+                       for (i = 0; i < zopt_objects; i++) {
+                               errno = 0;
+                               zopt_object[i] = strtoull(argv[i], NULL, 0);
+                               if (zopt_object[i] == 0 && errno != 0)
+                                       fatal("bad number %s: %s",
+                                           argv[i], strerror(errno));
+                       }
                }
-       }
-
-       if (os != NULL) {
-               dump_dir(os);
-               dmu_objset_close(os);
+               (os != NULL) ? dump_dir(os) : dump_zpool(spa);
        } else {
-               dump_zpool(spa);
-               spa_close(spa, FTAG);
+               flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
+               flagbits['c'] = ZDB_FLAG_CHECKSUM;
+               flagbits['d'] = ZDB_FLAG_DECOMPRESS;
+               flagbits['e'] = ZDB_FLAG_BSWAP;
+               flagbits['g'] = ZDB_FLAG_GBH;
+               flagbits['i'] = ZDB_FLAG_INDIRECT;
+               flagbits['p'] = ZDB_FLAG_PHYS;
+               flagbits['r'] = ZDB_FLAG_RAW;
+
+               for (i = 0; i < argc; i++)
+                       zdb_read_block(argv[i], spa);
        }
 
+       (os != NULL) ? dmu_objset_disown(os, FTAG) : spa_close(spa, FTAG);
+
        fuid_table_destroy();
+       sa_loaded = B_FALSE;
 
        libzfs_fini(g_zfs);
        kernel_fini();
index cc08ef5..a0ed985 100644 (file)
 
 extern uint8_t dump_opt[256];
 
+static char prefix[4] = "\t\t\t";
+
 static void
 print_log_bp(const blkptr_t *bp, const char *prefix)
 {
        char blkbuf[BP_SPRINTF_LEN];
 
-       sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp);
+       sprintf_blkptr(blkbuf, bp);
        (void) printf("%s%s\n", prefix, blkbuf);
 }
 
@@ -54,19 +56,29 @@ static void
 zil_prt_rec_create(zilog_t *zilog, int txtype, lr_create_t *lr)
 {
        time_t crtime = lr->lr_crtime[0];
-       char *name = (char *)(lr + 1);
-       char *link = name + strlen(name) + 1;
+       char *name, *link;
+       lr_attr_t *lrattr;
 
-       if (txtype == TX_SYMLINK)
-               (void) printf("\t\t\t%s -> %s\n", name, link);
-       else
-               (void) printf("\t\t\t%s\n", name);
+       name = (char *)(lr + 1);
+
+       if (lr->lr_common.lrc_txtype == TX_CREATE_ATTR ||
+           lr->lr_common.lrc_txtype == TX_MKDIR_ATTR) {
+               lrattr = (lr_attr_t *)(lr + 1);
+               name += ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+       }
+
+       if (txtype == TX_SYMLINK) {
+               link = name + strlen(name) + 1;
+               (void) printf("%s%s -> %s\n", prefix, name, link);
+       } else if (txtype != TX_MKXATTR) {
+               (void) printf("%s%s\n", prefix, name);
+       }
 
-       (void) printf("\t\t\t%s", ctime(&crtime));
-       (void) printf("\t\t\tdoid %llu, foid %llu, mode %llo\n",
+       (void) printf("%s%s", prefix, ctime(&crtime));
+       (void) printf("%sdoid %llu, foid %llu, mode %llo\n", prefix,
            (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_foid,
            (longlong_t)lr->lr_mode);
-       (void) printf("\t\t\tuid %llu, gid %llu, gen %llu, rdev 0x%llx\n",
+       (void) printf("%suid %llu, gid %llu, gen %llu, rdev 0x%llx\n", prefix,
            (u_longlong_t)lr->lr_uid, (u_longlong_t)lr->lr_gid,
            (u_longlong_t)lr->lr_gen, (u_longlong_t)lr->lr_rdev);
 }
@@ -75,7 +87,7 @@ zil_prt_rec_create(zilog_t *zilog, int txtype, lr_create_t *lr)
 static void
 zil_prt_rec_remove(zilog_t *zilog, int txtype, lr_remove_t *lr)
 {
-       (void) printf("\t\t\tdoid %llu, name %s\n",
+       (void) printf("%sdoid %llu, name %s\n", prefix,
            (u_longlong_t)lr->lr_doid, (char *)(lr + 1));
 }
 
@@ -83,7 +95,7 @@ zil_prt_rec_remove(zilog_t *zilog, int txtype, lr_remove_t *lr)
 static void
 zil_prt_rec_link(zilog_t *zilog, int txtype, lr_link_t *lr)
 {
-       (void) printf("\t\t\tdoid %llu, link_obj %llu, name %s\n",
+       (void) printf("%sdoid %llu, link_obj %llu, name %s\n", prefix,
            (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_link_obj,
            (char *)(lr + 1));
 }
@@ -95,9 +107,9 @@ zil_prt_rec_rename(zilog_t *zilog, int txtype, lr_rename_t *lr)
        char *snm = (char *)(lr + 1);
        char *tnm = snm + strlen(snm) + 1;
 
-       (void) printf("\t\t\tsdoid %llu, tdoid %llu\n",
+       (void) printf("%ssdoid %llu, tdoid %llu\n", prefix,
            (u_longlong_t)lr->lr_sdoid, (u_longlong_t)lr->lr_tdoid);
-       (void) printf("\t\t\tsrc %s tgt %s\n", snm, tnm);
+       (void) printf("%ssrc %s tgt %s\n", prefix, snm, tnm);
 }
 
 /* ARGSUSED */
@@ -106,43 +118,48 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
 {
        char *data, *dlimit;
        blkptr_t *bp = &lr->lr_blkptr;
+       zbookmark_t zb;
        char buf[SPA_MAXBLOCKSIZE];
        int verbose = MAX(dump_opt['d'], dump_opt['i']);
        int error;
 
-       (void) printf("\t\t\tfoid %llu, offset 0x%llx,"
-           " length 0x%llx, blkoff 0x%llx\n",
-           (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset,
-           (u_longlong_t)lr->lr_length, (u_longlong_t)lr->lr_blkoff);
+       (void) printf("%sfoid %llu, offset %llx, length %llx\n", prefix,
+           (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
+           (u_longlong_t)lr->lr_length);
 
-       if (verbose < 5)
+       if (txtype == TX_WRITE2 || verbose < 5)
                return;
 
        if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
-               (void) printf("\t\t\thas blkptr, %s\n",
+               (void) printf("%shas blkptr, %s\n", prefix,
                    bp->blk_birth >= spa_first_txg(zilog->zl_spa) ?
                    "will claim" : "won't claim");
-               print_log_bp(bp, "\t\t\t");
+               print_log_bp(bp, prefix);
+
+               if (BP_IS_HOLE(bp)) {
+                       (void) printf("\t\t\tLSIZE 0x%llx\n",
+                           (u_longlong_t)BP_GET_LSIZE(bp));
+               }
                if (bp->blk_birth == 0) {
                        bzero(buf, sizeof (buf));
-               } else {
-                       zbookmark_t zb;
-
-                       ASSERT3U(bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ==,
-                           dmu_objset_id(zilog->zl_os));
-
-                       zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
-                       zb.zb_object = 0;
-                       zb.zb_level = -1;
-                       zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
-
-                       error = zio_wait(zio_read(NULL, zilog->zl_spa,
-                           bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
-                           ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
-                       if (error)
-                               return;
+                       (void) printf("%s<hole>\n", prefix);
+                       return;
                }
-               data = buf + lr->lr_blkoff;
+               if (bp->blk_birth < zilog->zl_header->zh_claim_txg) {
+                       (void) printf("%s<block already committed>\n", prefix);
+                       return;
+               }
+
+               SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os),
+                   lr->lr_foid, ZB_ZIL_LEVEL,
+                   lr->lr_offset / BP_GET_LSIZE(bp));
+
+               error = zio_wait(zio_read(NULL, zilog->zl_spa,
+                   bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
+                   ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
+               if (error)
+                       return;
+               data = buf;
        } else {
                data = (char *)(lr + 1);
        }
@@ -150,7 +167,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
        dlimit = data + MIN(lr->lr_length,
            (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE));
 
-       (void) printf("\t\t\t");
+       (void) printf("%s", prefix);
        while (data < dlimit) {
                if (isprint(*data))
                        (void) printf("%c ", *data);
@@ -165,7 +182,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
 static void
 zil_prt_rec_truncate(zilog_t *zilog, int txtype, lr_truncate_t *lr)
 {
-       (void) printf("\t\t\tfoid %llu, offset 0x%llx, length 0x%llx\n",
+       (void) printf("%sfoid %llu, offset 0x%llx, length 0x%llx\n", prefix,
            (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset,
            (u_longlong_t)lr->lr_length);
 }
@@ -177,38 +194,38 @@ zil_prt_rec_setattr(zilog_t *zilog, int txtype, lr_setattr_t *lr)
        time_t atime = (time_t)lr->lr_atime[0];
        time_t mtime = (time_t)lr->lr_mtime[0];
 
-       (void) printf("\t\t\tfoid %llu, mask 0x%llx\n",
+       (void) printf("%sfoid %llu, mask 0x%llx\n", prefix,
            (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_mask);
 
        if (lr->lr_mask & AT_MODE) {
-               (void) printf("\t\t\tAT_MODE  %llo\n",
+               (void) printf("%sAT_MODE  %llo\n", prefix,
                    (longlong_t)lr->lr_mode);
        }
 
        if (lr->lr_mask & AT_UID) {
-               (void) printf("\t\t\tAT_UID   %llu\n",
+               (void) printf("%sAT_UID   %llu\n", prefix,
                    (u_longlong_t)lr->lr_uid);
        }
 
        if (lr->lr_mask & AT_GID) {
-               (void) printf("\t\t\tAT_GID   %llu\n",
+               (void) printf("%sAT_GID   %llu\n", prefix,
                    (u_longlong_t)lr->lr_gid);
        }
 
        if (lr->lr_mask & AT_SIZE) {
-               (void) printf("\t\t\tAT_SIZE  %llu\n",
+               (void) printf("%sAT_SIZE  %llu\n", prefix,
                    (u_longlong_t)lr->lr_size);
        }
 
        if (lr->lr_mask & AT_ATIME) {
-               (void) printf("\t\t\tAT_ATIME %llu.%09llu %s",
+               (void) printf("%sAT_ATIME %llu.%09llu %s", prefix,
                    (u_longlong_t)lr->lr_atime[0],
                    (u_longlong_t)lr->lr_atime[1],
                    ctime(&atime));
        }
 
        if (lr->lr_mask & AT_MTIME) {
-               (void) printf("\t\t\tAT_MTIME %llu.%09llu %s",
+               (void) printf("%sAT_MTIME %llu.%09llu %s", prefix,
                    (u_longlong_t)lr->lr_mtime[0],
                    (u_longlong_t)lr->lr_mtime[1],
                    ctime(&mtime));
@@ -219,7 +236,7 @@ zil_prt_rec_setattr(zilog_t *zilog, int txtype, lr_setattr_t *lr)
 static void
 zil_prt_rec_acl(zilog_t *zilog, int txtype, lr_acl_t *lr)
 {
-       (void) printf("\t\t\tfoid %llu, aclcnt %llu\n",
+       (void) printf("%sfoid %llu, aclcnt %llu\n", prefix,
            (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_aclcnt);
 }
 
@@ -251,10 +268,11 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = {
        {       zil_prt_rec_create,     "TX_MKDIR_ACL       " },
        {       zil_prt_rec_create,     "TX_MKDIR_ATTR      " },
        {       zil_prt_rec_create,     "TX_MKDIR_ACL_ATTR  " },
+       {       zil_prt_rec_write,      "TX_WRITE2          " },
 };
 
 /* ARGSUSED */
-static void
+static int
 print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg)
 {
        int txtype;
@@ -278,23 +296,24 @@ print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg)
 
        zil_rec_info[txtype].zri_count++;
        zil_rec_info[0].zri_count++;
+
+       return (0);
 }
 
 /* ARGSUSED */
-static void
+static int
 print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
-       char blkbuf[BP_SPRINTF_LEN];
+       char blkbuf[BP_SPRINTF_LEN + 10];
        int verbose = MAX(dump_opt['d'], dump_opt['i']);
        char *claim;
 
        if (verbose <= 3)
-               return;
+               return (0);
 
        if (verbose >= 5) {
                (void) strcpy(blkbuf, ", ");
-               sprintf_blkptr(blkbuf + strlen(blkbuf),
-                   BP_SPRINTF_LEN - strlen(blkbuf), bp);
+               sprintf_blkptr(blkbuf + strlen(blkbuf), bp);
        } else {
                blkbuf[0] = '\0';
        }
@@ -308,6 +327,8 @@ print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 
        (void) printf("\tBlock seqno %llu, %s%s\n",
            (u_longlong_t)bp->blk_cksum.zc_word[ZIL_ZC_SEQ], claim, blkbuf);
+
+       return (0);
 }
 
 static void
@@ -340,17 +361,17 @@ dump_intent_log(zilog_t *zilog)
        int verbose = MAX(dump_opt['d'], dump_opt['i']);
        int i;
 
-       if (zh->zh_log.blk_birth == 0 || verbose < 2)
+       if (zh->zh_log.blk_birth == 0 || verbose < 1)
                return;
 
-       (void) printf("\n    ZIL header: claim_txg %llu, claim_seq %llu",
-           (u_longlong_t)zh->zh_claim_txg, (u_longlong_t)zh->zh_claim_seq);
+       (void) printf("\n    ZIL header: claim_txg %llu, "
+           "claim_blk_seq %llu, claim_lr_seq %llu",
+           (u_longlong_t)zh->zh_claim_txg,
+           (u_longlong_t)zh->zh_claim_blk_seq,
+           (u_longlong_t)zh->zh_claim_lr_seq);
        (void) printf(" replay_seq %llu, flags 0x%llx\n",
            (u_longlong_t)zh->zh_replay_seq, (u_longlong_t)zh->zh_flags);
 
-       if (verbose >= 4)
-               print_log_bp(&zh->zh_log, "\n\tfirst block: ");
-
        for (i = 0; i < TX_MAX_TYPE; i++)
                zil_rec_info[i].zri_count = 0;
 
index 04dd2bd..e2ab90e 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <libintl.h>
@@ -107,7 +106,8 @@ zfs_callback(zfs_handle_t *zhp, void *data)
                                        zfs_prune_proplist(zhp,
                                            cb->cb_props_table);
 
-                               if (zfs_expand_proplist(zhp, cb->cb_proplist)
+                               if (zfs_expand_proplist(zhp, cb->cb_proplist,
+                                   (cb->cb_flags & ZFS_ITER_RECVD_PROPS))
                                    != 0) {
                                        free(node);
                                        return (-1);
@@ -350,11 +350,8 @@ zfs_for_each(int argc, char **argv, int flags, zfs_type_t types,
        avl_pool = uu_avl_pool_create("zfs_pool", sizeof (zfs_node_t),
            offsetof(zfs_node_t, zn_avlnode), zfs_sort, UU_DEFAULT);
 
-       if (avl_pool == NULL) {
-               (void) fprintf(stderr,
-                   gettext("internal error: out of memory\n"));
-               exit(1);
-       }
+       if (avl_pool == NULL)
+               nomem();
 
        cb.cb_sortcol = sortcol;
        cb.cb_flags = flags;
@@ -399,11 +396,8 @@ zfs_for_each(int argc, char **argv, int flags, zfs_type_t types,
                    sizeof (cb.cb_props_table));
        }
 
-       if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) {
-               (void) fprintf(stderr,
-                   gettext("internal error: out of memory\n"));
-               exit(1);
-       }
+       if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL)
+               nomem();
 
        if (argc == 0) {
                /*
@@ -453,11 +447,8 @@ zfs_for_each(int argc, char **argv, int flags, zfs_type_t types,
        /*
         * Finally, clean up the AVL tree.
         */
-       if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL) {
-               (void) fprintf(stderr,
-                   gettext("internal error: out of memory"));
-               exit(1);
-       }
+       if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL)
+               nomem();
 
        while ((node = uu_avl_walk_next(walk)) != NULL) {
                uu_avl_remove(cb.cb_avl, node);
index a029077..8c6b9fd 100644 (file)
@@ -42,6 +42,7 @@ typedef struct zfs_sort_column {
 #define        ZFS_ITER_ARGS_CAN_BE_PATHS (1 << 1)
 #define        ZFS_ITER_PROP_LISTSNAPS    (1 << 2)
 #define        ZFS_ITER_DEPTH_LIMIT       (1 << 3)
+#define        ZFS_ITER_RECVD_PROPS       (1 << 4)
 
 int zfs_for_each(int, char **, int options, zfs_type_t,
     zfs_sort_column_t *, zprop_list_t **, int, zfs_iter_f, void *);
index 1fbd8bc..353fd4f 100644 (file)
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <assert.h>
 #include <sys/mount.h>
 #include <sys/stat.h>
 #include <sys/fs/zfs.h>
+#include <sys/types.h>
+#include <time.h>
 
 #include <libzfs.h>
 #include <libuutil.h>
 
 #include "zfs_iter.h"
 #include "zfs_util.h"
+#include "zfs_comutil.h"
 
 libzfs_handle_t *g_zfs;
 
@@ -157,7 +159,7 @@ static zfs_command_t command_table[] = {
        { "list",       zfs_do_list,            HELP_LIST               },
        { NULL },
        { "set",        zfs_do_set,             HELP_SET                },
-       { "get",        zfs_do_get,             HELP_GET                },
+       { "get",        zfs_do_get,             HELP_GET                },
        { "inherit",    zfs_do_inherit,         HELP_INHERIT            },
        { "upgrade",    zfs_do_upgrade,         HELP_UPGRADE            },
        { "userspace",  zfs_do_userspace,       HELP_USERSPACE          },
@@ -197,16 +199,15 @@ get_usage(zfs_help_t idx)
                    "\tcreate [-ps] [-b blocksize] [-o property=value] ... "
                    "-V <size> <volume>\n"));
        case HELP_DESTROY:
-               return (gettext("\tdestroy [-rRf] "
-                   "<filesystem|volume|snapshot>\n"
-                   "\tdestroy -d [-r] <filesystem|volume|snapshot>\n"));
+               return (gettext("\tdestroy [-rRf] <filesystem|volume>\n"
+                   "\tdestroy [-rRd] <snapshot>\n"));
        case HELP_GET:
                return (gettext("\tget [-rHp] [-d max] "
-                   "[-o field[,...]] [-s source[,...]]\n"
+                   "[-o \"all\" | field[,...]] [-s source[,...]]\n"
                    "\t    <\"all\" | property[,...]> "
                    "[filesystem|volume|snapshot] ...\n"));
        case HELP_INHERIT:
-               return (gettext("\tinherit [-r] <property> "
+               return (gettext("\tinherit [-rS] <property> "
                    "<filesystem|volume|snapshot> ...\n"));
        case HELP_UPGRADE:
                return (gettext("\tupgrade [-v]\n"
@@ -222,9 +223,9 @@ get_usage(zfs_help_t idx)
        case HELP_PROMOTE:
                return (gettext("\tpromote <clone-filesystem>\n"));
        case HELP_RECEIVE:
-               return (gettext("\treceive [-vnF] <filesystem|volume|"
+               return (gettext("\treceive [-vnFu] <filesystem|volume|"
                "snapshot>\n"
-               "\treceive [-vnF] -d <filesystem>\n"));
+               "\treceive [-vnFu] [-d | -e] <filesystem>\n"));
        case HELP_RENAME:
                return (gettext("\trename <filesystem|volume|snapshot> "
                    "<filesystem|volume|snapshot>\n"
@@ -233,7 +234,7 @@ get_usage(zfs_help_t idx)
        case HELP_ROLLBACK:
                return (gettext("\trollback [-rRf] <snapshot>\n"));
        case HELP_SEND:
-               return (gettext("\tsend [-R] [-[iI] snapshot] <snapshot>\n"));
+               return (gettext("\tsend [-RDp] [-[iI] snapshot] <snapshot>\n"));
        case HELP_SET:
                return (gettext("\tset <property=value> "
                    "<filesystem|volume|snapshot> ...\n"));
@@ -288,22 +289,39 @@ get_usage(zfs_help_t idx)
        /* NOTREACHED */
 }
 
+void
+nomem(void)
+{
+       (void) fprintf(stderr, gettext("internal error: out of memory\n"));
+       exit(1);
+}
+
 /*
  * Utility function to guarantee malloc() success.
  */
+
 void *
 safe_malloc(size_t size)
 {
        void *data;
 
-       if ((data = calloc(1, size)) == NULL) {
-               (void) fprintf(stderr, "internal error: out of memory\n");
-               exit(1);
-       }
+       if ((data = calloc(1, size)) == NULL)
+               nomem();
 
        return (data);
 }
 
+static char *
+safe_strdup(char *str)
+{
+       char *dupstr = strdup(str);
+
+       if (dupstr == NULL)
+               nomem();
+
+       return (dupstr);
+}
+
 /*
  * Callback routine that will print out information for each of
  * the properties.
@@ -442,11 +460,8 @@ parseprop(nvlist_t *props)
                    "specified multiple times\n"), propname);
                return (-1);
        }
-       if (nvlist_add_string(props, propname, propval) != 0) {
-               (void) fprintf(stderr, gettext("internal "
-                   "error: out of memory\n"));
-               return (-1);
-       }
+       if (nvlist_add_string(props, propname, propval) != 0)
+               nomem();
        return (0);
 }
 
@@ -471,6 +486,59 @@ parse_depth(char *opt, int *flags)
        return (depth);
 }
 
+#define        PROGRESS_DELAY 2                /* seconds */
+
+static char *pt_reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
+static time_t pt_begin;
+static char *pt_header = NULL;
+static boolean_t pt_shown;
+
+static void
+start_progress_timer(void)
+{
+       pt_begin = time(NULL) + PROGRESS_DELAY;
+       pt_shown = B_FALSE;
+}
+
+static void
+set_progress_header(char *header)
+{
+       assert(pt_header == NULL);
+       pt_header = safe_strdup(header);
+       if (pt_shown) {
+               (void) printf("%s: ", header);
+               (void) fflush(stdout);
+       }
+}
+
+static void
+update_progress(char *update)
+{
+       if (!pt_shown && time(NULL) > pt_begin) {
+               int len = strlen(update);
+
+               (void) printf("%s: %s%*.*s", pt_header, update, len, len,
+                   pt_reverse);
+               (void) fflush(stdout);
+               pt_shown = B_TRUE;
+       } else if (pt_shown) {
+               int len = strlen(update);
+
+               (void) printf("%s%*.*s", update, len, len, pt_reverse);
+               (void) fflush(stdout);
+       }
+}
+
+static void
+finish_progress(char *done)
+{
+       if (pt_shown) {
+               (void) printf("%s\n", done);
+               (void) fflush(stdout);
+       }
+       free(pt_header);
+       pt_header = NULL;
+}
 /*
  * zfs clone [-p] [-o prop=value] ... <snap> <fs | vol>
  *
@@ -490,11 +558,8 @@ zfs_do_clone(int argc, char **argv)
        int ret;
        int c;
 
-       if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) {
-               (void) fprintf(stderr, gettext("internal error: "
-                   "out of memory\n"));
-               return (1);
-       }
+       if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+               nomem();
 
        /* check options */
        while ((c = getopt(argc, argv, "o:p")) != -1) {
@@ -608,11 +673,8 @@ zfs_do_create(int argc, char **argv)
        uint64_t intval;
        int canmount;
 
-       if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) {
-               (void) fprintf(stderr, gettext("internal error: "
-                   "out of memory\n"));
-               return (1);
-       }
+       if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+               nomem();
 
        /* check options */
        while ((c = getopt(argc, argv, ":V:b:so:p")) != -1) {
@@ -627,12 +689,8 @@ zfs_do_create(int argc, char **argv)
                        }
 
                        if (nvlist_add_uint64(props,
-                           zfs_prop_to_name(ZFS_PROP_VOLSIZE),
-                           intval) != 0) {
-                               (void) fprintf(stderr, gettext("internal "
-                                   "error: out of memory\n"));
-                               goto error;
-                       }
+                           zfs_prop_to_name(ZFS_PROP_VOLSIZE), intval) != 0)
+                               nomem();
                        volsize = intval;
                        break;
                case 'p':
@@ -649,11 +707,8 @@ zfs_do_create(int argc, char **argv)
 
                        if (nvlist_add_uint64(props,
                            zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
-                           intval) != 0) {
-                               (void) fprintf(stderr, gettext("internal "
-                                   "error: out of memory\n"));
-                               goto error;
-                       }
+                           intval) != 0)
+                               nomem();
                        break;
                case 'o':
                        if (parseprop(props))
@@ -715,15 +770,14 @@ zfs_do_create(int argc, char **argv)
                        resv_prop = ZFS_PROP_REFRESERVATION;
                else
                        resv_prop = ZFS_PROP_RESERVATION;
+               volsize = zvol_volsize_to_reservation(volsize, props);
 
                if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop),
                    &strval) != 0) {
                        if (nvlist_add_uint64(props,
                            zfs_prop_to_name(resv_prop), volsize) != 0) {
-                               (void) fprintf(stderr, gettext("internal "
-                                   "error: out of memory\n"));
                                nvlist_free(props);
-                               return (1);
+                               nomem();
                        }
                }
        }
@@ -785,12 +839,12 @@ badusage:
 }
 
 /*
- * zfs destroy [-rRf] <fs, snap, vol>
- * zfs destroy -d [-r] <fs, snap, vol>
+ * zfs destroy [-rRf] <fs, vol>
+ * zfs destroy [-rRd] <snap>
  *
- *     -r      Recursively destroy all children
- *     -R      Recursively destroy all dependents, including clones
- *     -f      Force unmounting of any dependents
+ *     -r      Recursively destroy all children
+ *     -R      Recursively destroy all dependents, including clones
+ *     -f      Force unmounting of any dependents
  *     -d      If we can't destroy now, mark for deferred destruction
  *
  * Destroys the given dataset.  By default, it will unmount any filesystems,
@@ -876,7 +930,7 @@ destroy_callback(zfs_handle_t *zhp, void *data)
 
        /*
         * Ignore pools (which we've already flagged as an error before getting
-        * here.
+        * here).
         */
        if (strchr(zfs_get_name(zhp), '/') == NULL &&
            zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
@@ -940,12 +994,14 @@ zfs_do_destroy(int argc, char **argv)
        int c;
        zfs_handle_t *zhp;
        char *cp;
+       zfs_type_t type = ZFS_TYPE_DATASET;
 
        /* check options */
        while ((c = getopt(argc, argv, "dfrR")) != -1) {
                switch (c) {
                case 'd':
                        cb.cb_defer_destroy = B_TRUE;
+                       type = ZFS_TYPE_SNAPSHOT;
                        break;
                case 'f':
                        cb.cb_force = 1;
@@ -978,9 +1034,6 @@ zfs_do_destroy(int argc, char **argv)
                usage(B_FALSE);
        }
 
-       if (cb.cb_defer_destroy && cb.cb_doclones)
-               usage(B_FALSE);
-
        /*
         * If we are doing recursive destroy of a snapshot, then the
         * named snapshot may not exist.  Go straight to libzfs.
@@ -995,11 +1048,19 @@ zfs_do_destroy(int argc, char **argv)
                cp++;
 
                if (cb.cb_doclones) {
+                       boolean_t defer = cb.cb_defer_destroy;
+
+                       /*
+                        * Temporarily ignore the defer_destroy setting since
+                        * it's not supported for clones.
+                        */
+                       cb.cb_defer_destroy = B_FALSE;
                        cb.cb_snapname = cp;
                        if (destroy_snap_clones(zhp, &cb) != 0) {
                                zfs_close(zhp);
                                return (1);
                        }
+                       cb.cb_defer_destroy = defer;
                }
 
                ret = zfs_destroy_snaps(zhp, cp, cb.cb_defer_destroy);
@@ -1012,7 +1073,7 @@ zfs_do_destroy(int argc, char **argv)
        }
 
        /* Open the given dataset */
-       if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL)
+       if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL)
                return (1);
 
        cb.cb_target = zhp;
@@ -1062,18 +1123,32 @@ zfs_do_destroy(int argc, char **argv)
        return (0);
 }
 
+static boolean_t
+is_recvd_column(zprop_get_cbdata_t *cbp)
+{
+       int i;
+       zfs_get_column_t col;
+
+       for (i = 0; i < ZFS_GET_NCOLS &&
+           (col = cbp->cb_columns[i]) != GET_COL_NONE; i++)
+               if (col == GET_COL_RECVD)
+                       return (B_TRUE);
+       return (B_FALSE);
+}
+
 /*
- * zfs get [-rHp] [-o field[,field]...] [-s source[,source]...]
- *     < all | property[,property]... > < fs | snap | vol > ...
+ * zfs get [-rHp] [-o all | field[,field]...] [-s source[,source]...]
+ *     < all | property[,property]... > < fs | snap | vol > ...
  *
  *     -r      recurse over any child datasets
  *     -H      scripted mode.  Headers are stripped, and fields are separated
  *             by tabs instead of spaces.
- *     -o      Set of fields to display.  One of "name,property,value,source".
- *             Default is all four.
+ *     -o      Set of fields to display.  One of "name,property,value,
+ *             received,source". Default is "name,property,value,source".
+ *             "all" is an alias for all five.
  *     -s      Set of sources to allow.  One of
- *             "local,default,inherited,temporary,none".  Default is all
- *             five.
+ *             "local,default,inherited,received,temporary,none".  Default is
+ *             all six.
  *     -p      Display values in parsable (literal) format.
  *
  *  Prints properties for the given datasets.  The user can control which
@@ -1087,16 +1162,19 @@ static int
 get_callback(zfs_handle_t *zhp, void *data)
 {
        char buf[ZFS_MAXPROPLEN];
+       char rbuf[ZFS_MAXPROPLEN];
        zprop_source_t sourcetype;
        char source[ZFS_MAXNAMELEN];
        zprop_get_cbdata_t *cbp = data;
-       nvlist_t *userprop = zfs_get_user_props(zhp);
+       nvlist_t *user_props = zfs_get_user_props(zhp);
        zprop_list_t *pl = cbp->cb_proplist;
        nvlist_t *propval;
        char *strval;
        char *sourceval;
+       boolean_t received = is_recvd_column(cbp);
 
        for (; pl != NULL; pl = pl->pl_next) {
+               char *recvdval = NULL;
                /*
                 * Skip the special fake placeholder.  This will also skip over
                 * the name property when 'all' is specified.
@@ -1123,9 +1201,14 @@ get_callback(zfs_handle_t *zhp, void *data)
                                (void) strlcpy(buf, "-", sizeof (buf));
                        }
 
+                       if (received && (zfs_prop_get_recvd(zhp,
+                           zfs_prop_to_name(pl->pl_prop), rbuf, sizeof (rbuf),
+                           cbp->cb_literal) == 0))
+                               recvdval = rbuf;
+
                        zprop_print_one_property(zfs_get_name(zhp), cbp,
                            zfs_prop_to_name(pl->pl_prop),
-                           buf, sourcetype, source);
+                           buf, sourcetype, source, recvdval);
                } else if (zfs_prop_userquota(pl->pl_user_prop)) {
                        sourcetype = ZPROP_SRC_LOCAL;
 
@@ -1136,9 +1219,9 @@ get_callback(zfs_handle_t *zhp, void *data)
                        }
 
                        zprop_print_one_property(zfs_get_name(zhp), cbp,
-                           pl->pl_user_prop, buf, sourcetype, source);
+                           pl->pl_user_prop, buf, sourcetype, source, NULL);
                } else {
-                       if (nvlist_lookup_nvlist(userprop,
+                       if (nvlist_lookup_nvlist(user_props,
                            pl->pl_user_prop, &propval) != 0) {
                                if (pl->pl_all)
                                        continue;
@@ -1153,6 +1236,9 @@ get_callback(zfs_handle_t *zhp, void *data)
                                if (strcmp(sourceval,
                                    zfs_get_name(zhp)) == 0) {
                                        sourcetype = ZPROP_SRC_LOCAL;
+                               } else if (strcmp(sourceval,
+                                   ZPROP_SOURCE_VAL_RECVD) == 0) {
+                                       sourcetype = ZPROP_SRC_RECEIVED;
                                } else {
                                        sourcetype = ZPROP_SRC_INHERITED;
                                        (void) strlcpy(source,
@@ -1160,9 +1246,14 @@ get_callback(zfs_handle_t *zhp, void *data)
                                }
                        }
 
+                       if (received && (zfs_prop_get_recvd(zhp,
+                           pl->pl_user_prop, rbuf, sizeof (rbuf),
+                           cbp->cb_literal) == 0))
+                               recvdval = rbuf;
+
                        zprop_print_one_property(zfs_get_name(zhp), cbp,
                            pl->pl_user_prop, strval, sourcetype,
-                           source);
+                           source, recvdval);
                }
        }
 
@@ -1218,10 +1309,10 @@ zfs_do_get(int argc, char **argv)
                        i = 0;
                        while (*optarg != '\0') {
                                static char *col_subopts[] =
-                                   { "name", "property", "value", "source",
-                                   NULL };
+                                   { "name", "property", "value", "received",
+                                   "source", "all", NULL };
 
-                               if (i == 4) {
+                               if (i == ZFS_GET_NCOLS) {
                                        (void) fprintf(stderr, gettext("too "
                                            "many fields given to -o "
                                            "option\n"));
@@ -1240,8 +1331,28 @@ zfs_do_get(int argc, char **argv)
                                        cb.cb_columns[i++] = GET_COL_VALUE;
                                        break;
                                case 3:
+                                       cb.cb_columns[i++] = GET_COL_RECVD;
+                                       flags |= ZFS_ITER_RECVD_PROPS;
+                                       break;
+                               case 4:
                                        cb.cb_columns[i++] = GET_COL_SOURCE;
                                        break;
+                               case 5:
+                                       if (i > 0) {
+                                               (void) fprintf(stderr,
+                                                   gettext("\"all\" conflicts "
+                                                   "with specific fields "
+                                                   "given to -o option\n"));
+                                               usage(B_FALSE);
+                                       }
+                                       cb.cb_columns[0] = GET_COL_NAME;
+                                       cb.cb_columns[1] = GET_COL_PROPERTY;
+                                       cb.cb_columns[2] = GET_COL_VALUE;
+                                       cb.cb_columns[3] = GET_COL_RECVD;
+                                       cb.cb_columns[4] = GET_COL_SOURCE;
+                                       flags |= ZFS_ITER_RECVD_PROPS;
+                                       i = ZFS_GET_NCOLS;
+                                       break;
                                default:
                                        (void) fprintf(stderr,
                                            gettext("invalid column name "
@@ -1256,7 +1367,8 @@ zfs_do_get(int argc, char **argv)
                        while (*optarg != '\0') {
                                static char *source_subopts[] = {
                                        "local", "default", "inherited",
-                                       "temporary", "none", NULL };
+                                       "received", "temporary", "none",
+                                       NULL };
 
                                switch (getsubopt(&optarg, source_subopts,
                                    &value)) {
@@ -1270,9 +1382,12 @@ zfs_do_get(int argc, char **argv)
                                        cb.cb_sources |= ZPROP_SRC_INHERITED;
                                        break;
                                case 3:
-                                       cb.cb_sources |= ZPROP_SRC_TEMPORARY;
+                                       cb.cb_sources |= ZPROP_SRC_RECEIVED;
                                        break;
                                case 4:
+                                       cb.cb_sources |= ZPROP_SRC_TEMPORARY;
+                                       break;
+                               case 5:
                                        cb.cb_sources |= ZPROP_SRC_NONE;
                                        break;
                                default:
@@ -1339,9 +1454,10 @@ zfs_do_get(int argc, char **argv)
 }
 
 /*
- * inherit [-r] <property> <fs|vol> ...
+ * inherit [-rS] <property> <fs|vol> ...
  *
- *     -r      Recurse over all children
+ *     -r      Recurse over all children
+ *     -S      Revert to received value, if any
  *
  * For each dataset specified on the command line, inherit the given property
  * from its parent.  Inheriting a property at the pool level will cause it to
@@ -1350,11 +1466,16 @@ zfs_do_get(int argc, char **argv)
  * local modifications for each dataset.
  */
 
+typedef struct inherit_cbdata {
+       const char *cb_propname;
+       boolean_t cb_received;
+} inherit_cbdata_t;
+
 static int
 inherit_recurse_cb(zfs_handle_t *zhp, void *data)
 {
-       char *propname = data;
-       zfs_prop_t prop = zfs_name_to_prop(propname);
+       inherit_cbdata_t *cb = data;
+       zfs_prop_t prop = zfs_name_to_prop(cb->cb_propname);
 
        /*
         * If we're doing it recursively, then ignore properties that
@@ -1364,15 +1485,15 @@ inherit_recurse_cb(zfs_handle_t *zhp, void *data)
            !zfs_prop_valid_for_type(prop, zfs_get_type(zhp)))
                return (0);
 
-       return (zfs_prop_inherit(zhp, propname) != 0);
+       return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0);
 }
 
 static int
 inherit_cb(zfs_handle_t *zhp, void *data)
 {
-       char *propname = data;
+       inherit_cbdata_t *cb = data;
 
-       return (zfs_prop_inherit(zhp, propname) != 0);
+       return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0);
 }
 
 static int
@@ -1380,16 +1501,21 @@ zfs_do_inherit(int argc, char **argv)
 {
        int c;
        zfs_prop_t prop;
+       inherit_cbdata_t cb = { 0 };
        char *propname;
        int ret;
        int flags = 0;
+       boolean_t received = B_FALSE;
 
        /* check options */
-       while ((c = getopt(argc, argv, "r")) != -1) {
+       while ((c = getopt(argc, argv, "rS")) != -1) {
                switch (c) {
                case 'r':
                        flags |= ZFS_ITER_RECURSE;
                        break;
+               case 'S':
+                       received = B_TRUE;
+                       break;
                case '?':
                default:
                        (void) fprintf(stderr, gettext("invalid option '%c'\n"),
@@ -1422,7 +1548,7 @@ zfs_do_inherit(int argc, char **argv)
                            propname);
                        return (1);
                }
-               if (!zfs_prop_inheritable(prop)) {
+               if (!zfs_prop_inheritable(prop) && !received) {
                        (void) fprintf(stderr, gettext("'%s' property cannot "
                            "be inherited\n"), propname);
                        if (prop == ZFS_PROP_QUOTA ||
@@ -1433,18 +1559,27 @@ zfs_do_inherit(int argc, char **argv)
                                    "%s=none' to clear\n"), propname);
                        return (1);
                }
+               if (received && (prop == ZFS_PROP_VOLSIZE ||
+                   prop == ZFS_PROP_VERSION)) {
+                       (void) fprintf(stderr, gettext("'%s' property cannot "
+                           "be reverted to a received value\n"), propname);
+                       return (1);
+               }
        } else if (!zfs_prop_user(propname)) {
                (void) fprintf(stderr, gettext("invalid property '%s'\n"),
                    propname);
                usage(B_FALSE);
        }
 
+       cb.cb_propname = propname;
+       cb.cb_received = received;
+
        if (flags & ZFS_ITER_RECURSE) {
                ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET,
-                   NULL, NULL, 0, inherit_recurse_cb, propname);
+                   NULL, NULL, 0, inherit_recurse_cb, &cb);
        } else {
                ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET,
-                   NULL, NULL, 0, inherit_cb, propname);
+                   NULL, NULL, 0, inherit_cb, &cb);
        }
 
        return (ret);
@@ -1513,31 +1648,25 @@ upgrade_set_callback(zfs_handle_t *zhp, void *data)
 {
        upgrade_cbdata_t *cb = data;
        int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
-       int i;
-       static struct { int zplver; int spaver; } table[] = {
-               {ZPL_VERSION_FUID, SPA_VERSION_FUID},
-               {ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE},
-               {0, 0}
-       };
-
-
-       for (i = 0; table[i].zplver; i++) {
-               if (cb->cb_version >= table[i].zplver) {
-                       int spa_version;
-
-                       if (zfs_spa_version(zhp, &spa_version) < 0)
-                               return (-1);
-
-                       if (spa_version < table[i].spaver) {
-                               /* can't upgrade */
-                               (void) printf(gettext("%s: can not be "
-                                   "upgraded; the pool version needs to first "
-                                   "be upgraded\nto version %d\n\n"),
-                                   zfs_get_name(zhp), table[i].spaver);
-                               cb->cb_numfailed++;
-                               return (0);
-                       }
-               }
+       int needed_spa_version;
+       int spa_version;
+
+       if (zfs_spa_version(zhp, &spa_version) < 0)
+               return (-1);
+
+       needed_spa_version = zfs_spa_version_map(cb->cb_version);
+
+       if (needed_spa_version < 0)
+               return (-1);
+
+       if (spa_version < needed_spa_version) {
+               /* can't upgrade */
+               (void) printf(gettext("%s: can not be "
+                   "upgraded; the pool version needs to first "
+                   "be upgraded\nto version %d\n\n"),
+                   zfs_get_name(zhp), needed_spa_version);
+               cb->cb_numfailed++;
+               return (0);
        }
 
        /* upgrade */
@@ -1639,11 +1768,10 @@ zfs_do_upgrade(int argc, char **argv)
                    "unique identifier (FUID)\n"));
                (void) printf(gettext(" 4   userquota, groupquota "
                    "properties\n"));
+               (void) printf(gettext(" 5   System attributes\n"));
                (void) printf(gettext("\nFor more information on a particular "
-                   "version, including supported releases, see:\n\n"));
-               (void) printf("http://www.opensolaris.org/os/community/zfs/"
-                   "version/zpl/N\n\n");
-               (void) printf(gettext("Where 'N' is the version number.\n"));
+                   "version, including supported releases,\n"));
+               (void) printf("see the ZFS Administration Guide.\n\n");
                ret = 0;
        } else if (argc || all) {
                /* Upgrade filesystems */
@@ -1769,11 +1897,11 @@ zfs_do_userspace(int argc, char **argv)
  *      [-s property [-s property]...] [-S property [-S property]...]
  *      <dataset> ...
  *
- *     -r      Recurse over all children
- *     -d      Limit recursion by depth.
- *     -H      Scripted mode; elide headers and separate columns by tabs
- *     -o      Control which fields to display.
- *     -t      Control which object types to display.
+ *     -r      Recurse over all children
+ *     -d      Limit recursion by depth.
+ *     -H      Scripted mode; elide headers and separate columns by tabs
+ *     -o      Control which fields to display.
+ *     -t      Control which object types to display.
  *     -s      Specify sort columns, descending order.
  *     -S      Specify sort columns, ascending order.
  *
@@ -2170,9 +2298,9 @@ zfs_do_promote(int argc, char **argv)
 /*
  * zfs rollback [-rRf] <snapshot>
  *
- *     -r      Delete any intervening snapshots before doing rollback
- *     -R      Delete any snapshots and their clones
- *     -f      ignored for backwards compatability
+ *     -r      Delete any intervening snapshots before doing rollback
+ *     -R      Delete any snapshots and their clones
+ *     -f      ignored for backward compatibility
  *
  * Given a filesystem, rollback to a specific snapshot, discarding any changes
  * since then and making it the active dataset.  If more recent snapshots exist,
@@ -2433,11 +2561,8 @@ zfs_do_snapshot(int argc, char **argv)
        char c;
        nvlist_t *props;
 
-       if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) {
-               (void) fprintf(stderr, gettext("internal error: "
-                   "out of memory\n"));
-               return (1);
-       }
+       if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+               nomem();
 
        /* check options */
        while ((c = getopt(argc, argv, "ro:")) != -1) {
@@ -2482,8 +2607,8 @@ usage:
 }
 
 /*
- * zfs send [-v] -R [-i|-I <@snap>] <fs@snap>
- * zfs send [-v] [-i|-I <@snap>] <fs@snap>
+ * zfs send [-vDp] -R [-i|-I <@snap>] <fs@snap>
+ * zfs send [-vDp] [-i|-I <@snap>] <fs@snap>
  *
  * Send a backup stream to stdout.
  */
@@ -2494,14 +2619,13 @@ zfs_do_send(int argc, char **argv)
        char *toname = NULL;
        char *cp;
        zfs_handle_t *zhp;
-       boolean_t doall = B_FALSE;
-       boolean_t replicate = B_FALSE;
-       boolean_t fromorigin = B_FALSE;
-       boolean_t verbose = B_FALSE;
+       sendflags_t flags = { 0 };
        int c, err;
+       nvlist_t *dbgnv;
+       boolean_t extraverbose = B_FALSE;
 
        /* check options */
-       while ((c = getopt(argc, argv, ":i:I:Rv")) != -1) {
+       while ((c = getopt(argc, argv, ":i:I:RDpv")) != -1) {
                switch (c) {
                case 'i':
                        if (fromname)
@@ -2512,13 +2636,21 @@ zfs_do_send(int argc, char **argv)
                        if (fromname)
                                usage(B_FALSE);
                        fromname = optarg;
-                       doall = B_TRUE;
+                       flags.doall = B_TRUE;
                        break;
                case 'R':
-                       replicate = B_TRUE;
+                       flags.replicate = B_TRUE;
+                       break;
+               case 'p':
+                       flags.props = B_TRUE;
                        break;
                case 'v':
-                       verbose = B_TRUE;
+                       if (flags.verbose)
+                               extraverbose = B_TRUE;
+                       flags.verbose = B_TRUE;
+                       break;
+               case 'D':
+                       flags.dedup = B_TRUE;
                        break;
                case ':':
                        (void) fprintf(stderr, gettext("missing argument for "
@@ -2578,7 +2710,7 @@ zfs_do_send(int argc, char **argv)
 
                if (strcmp(origin, fromname) == 0) {
                        fromname = NULL;
-                       fromorigin = B_TRUE;
+                       flags.fromorigin = B_TRUE;
                } else {
                        *cp = '\0';
                        if (cp != fromname && strcmp(argv[0], fromname)) {
@@ -2596,18 +2728,29 @@ zfs_do_send(int argc, char **argv)
                }
        }
 
-       if (replicate && fromname == NULL)
-               doall = B_TRUE;
+       if (flags.replicate && fromname == NULL)
+               flags.doall = B_TRUE;
+
+       err = zfs_send(zhp, fromname, toname, flags, STDOUT_FILENO, NULL, 0,
+           extraverbose ? &dbgnv : NULL);
 
-       err = zfs_send(zhp, fromname, toname, replicate, doall, fromorigin,
-           verbose, STDOUT_FILENO);
+       if (extraverbose) {
+               /*
+                * dump_nvlist prints to stdout, but that's been
+                * redirected to a file.  Make it print to stderr
+                * instead.
+                */
+               (void) dup2(STDERR_FILENO, STDOUT_FILENO);
+               dump_nvlist(dbgnv, 0);
+               nvlist_free(dbgnv);
+       }
        zfs_close(zhp);
 
        return (err != 0);
 }
 
 /*
- * zfs receive [-dnvF] <fs@snap>
+ * zfs receive [-vnFu] [-d | -e] <fs@snap>
  *
  * Restore a backup stream from stdin.
  */
@@ -2615,15 +2758,18 @@ static int
 zfs_do_receive(int argc, char **argv)
 {
        int c, err;
-       recvflags_t flags;
+       recvflags_t flags = { 0 };
 
-       bzero(&flags, sizeof (recvflags_t));
        /* check options */
-       while ((c = getopt(argc, argv, ":dnuvF")) != -1) {
+       while ((c = getopt(argc, argv, ":denuvF")) != -1) {
                switch (c) {
                case 'd':
                        flags.isprefix = B_TRUE;
                        break;
+               case 'e':
+                       flags.isprefix = B_TRUE;
+                       flags.istail = B_TRUE;
+                       break;
                case 'n':
                        flags.dryrun = B_TRUE;
                        break;
@@ -2681,15 +2827,19 @@ zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding)
        int i;
        const char *tag;
        boolean_t recursive = B_FALSE;
+       boolean_t temphold = B_FALSE;
+       const char *opts = holding ? "rt" : "r";
        int c;
-       int (*func)(zfs_handle_t *, const char *, const char *, boolean_t);
 
        /* check options */
-       while ((c = getopt(argc, argv, "r")) != -1) {
+       while ((c = getopt(argc, argv, opts)) != -1) {
                switch (c) {
                case 'r':
                        recursive = B_TRUE;
                        break;
+               case 't':
+                       temphold = B_TRUE;
+                       break;
                case '?':
                        (void) fprintf(stderr, gettext("invalid option '%c'\n"),
                            optopt);
@@ -2708,16 +2858,10 @@ zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding)
        --argc;
        ++argv;
 
-       if (holding) {
-               if (tag[0] == '.') {
-                       /* tags starting with '.' are reserved for libzfs */
-                       (void) fprintf(stderr,
-                           gettext("tag may not start with '.'\n"));
-                       usage(B_FALSE);
-               }
-               func = zfs_hold;
-       } else {
-               func = zfs_release;
+       if (holding && tag[0] == '.') {
+               /* tags starting with '.' are reserved for libzfs */
+               (void) fprintf(stderr, gettext("tag may not start with '.'\n"));
+               usage(B_FALSE);
        }
 
        for (i = 0; i < argc; ++i) {
@@ -2742,8 +2886,14 @@ zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding)
                        ++errors;
                        continue;
                }
-               if (func(zhp, delim+1, tag, recursive) != 0)
-                       ++errors;
+               if (holding) {
+                       if (zfs_hold(zhp, delim+1, tag, recursive,
+                           temphold, B_FALSE) != 0)
+                               ++errors;
+               } else {
+                       if (zfs_release(zhp, delim+1, tag, recursive) != 0)
+                               ++errors;
+               }
                zfs_close(zhp);
        }
 
@@ -2751,9 +2901,10 @@ zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding)
 }
 
 /*
- * zfs hold [-r] <tag> <snap> ...
+ * zfs hold [-r] [-t] <tag> <snap> ...
  *
- *     -r      Recursively hold
+ *     -r      Recursively hold
+ *     -t      Temporary hold (hidden option)
  *
  * Apply a user-hold with the given tag to the list of snapshots.
  */
@@ -2766,7 +2917,7 @@ zfs_do_hold(int argc, char **argv)
 /*
  * zfs release [-r] <tag> <snap> ...
  *
- *     -r      Recursively release
+ *     -r      Recursively release
  *
  * Release a user-hold with the given tag from the list of snapshots.
  */
@@ -2791,7 +2942,7 @@ typedef struct get_all_cbdata {
 static int
 get_one_dataset(zfs_handle_t *zhp, void *data)
 {
-       static char spin[] = { '-', '\\', '|', '/' };
+       static char *spin[] = { "-", "\\", "|", "/" };
        static int spinval = 0;
        static int spincheck = 0;
        static time_t last_spin_time = (time_t)0;
@@ -2802,8 +2953,7 @@ get_one_dataset(zfs_handle_t *zhp, void *data)
                if (--spincheck < 0) {
                        time_t now = time(NULL);
                        if (last_spin_time + SPINNER_TIME < now) {
-                               (void) printf("\b%c", spin[spinval++ % 4]);
-                               (void) fflush(stdout);
+                               update_progress(spin[spinval++ % 4]);
                                last_spin_time = now;
                        }
                        spincheck = CHECK_SPINNER;
@@ -2859,19 +3009,16 @@ get_all_datasets(uint_t types, zfs_handle_t ***dslist, size_t *count,
        cb.cb_types = types;
        cb.cb_verbose = verbose;
 
-       if (verbose) {
-               (void) printf("%s: *", gettext("Reading ZFS config"));
-               (void) fflush(stdout);
-       }
+       if (verbose)
+               set_progress_header(gettext("Reading ZFS config"));
 
        (void) zfs_iter_root(g_zfs, get_one_dataset, &cb);
 
        *dslist = cb.cb_handles;
        *count = cb.cb_used;
 
-       if (verbose) {
-               (void) printf("\b%s\n", gettext("done."));
-       }
+       if (verbose)
+               finish_progress(gettext("done."));
 }
 
 static int
@@ -3098,42 +3245,9 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol,
                                return (1);
                        break;
                }
-       } else {
+       } else
                assert(op == OP_SHARE);
 
-               /*
-                * Ignore any volumes that aren't shared.
-                */
-               verify(zfs_prop_get(zhp, ZFS_PROP_SHAREISCSI, shareopts,
-                   sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
-
-               if (strcmp(shareopts, "off") == 0) {
-                       if (!explicit)
-                               return (0);
-
-                       (void) fprintf(stderr, gettext("cannot share '%s': "
-                           "'shareiscsi' property not set\n"),
-                           zfs_get_name(zhp));
-                       (void) fprintf(stderr, gettext("set 'shareiscsi' "
-                           "property or use iscsitadm(1M) to share this "
-                           "volume\n"));
-                       return (1);
-               }
-
-               if (zfs_is_shared_iscsi(zhp)) {
-                       if (!explicit)
-                               return (0);
-
-                       (void) fprintf(stderr, gettext("cannot share "
-                           "'%s': volume already shared\n"),
-                           zfs_get_name(zhp));
-                       return (1);
-               }
-
-               if (zfs_share_iscsi(zhp) != 0)
-                       return (1);
-       }
-
        return (0);
 }
 
@@ -3143,19 +3257,16 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol,
 static void
 report_mount_progress(int current, int total)
 {
-       static int len;
-       static char *reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"
-           "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
-       static time_t last_progress_time;
+       static time_t last_progress_time = 0;
        time_t now = time(NULL);
+       char info[32];
 
        /* report 1..n instead of 0..n-1 */
        ++current;
 
        /* display header if we're here for the first time */
        if (current == 1) {
-               (void) printf(gettext("Mounting ZFS filesystems: "));
-               len = 0;
+               set_progress_header(gettext("Mounting ZFS filesystems"));
        } else if (current != total && last_progress_time + MOUNT_TIME >= now) {
                /* too soon to report again */
                return;
@@ -3163,13 +3274,12 @@ report_mount_progress(int current, int total)
 
        last_progress_time = now;
 
-       /* back up to prepare for overwriting */
-       if (len)
-               (void) printf("%*.*s", len, len, reverse);
+       (void) sprintf(info, "(%d/%d)", current, total);
 
-       /* We put a newline at the end if this is the last one.  */
-       len = printf("(%d/%d)%s", current, total, current == total ? "\n" : "");
-       (void) fflush(stdout);
+       if (current == total)
+               finish_progress(info);
+       else
+               update_progress(info);
 }
 
 static void
@@ -3254,11 +3364,9 @@ share_mount(int op, int argc, char **argv)
                        if (strcmp(argv[0], "nfs") == 0 ||
                            strcmp(argv[0], "smb") == 0) {
                                types = ZFS_TYPE_FILESYSTEM;
-                       } else if (strcmp(argv[0], "iscsi") == 0) {
-                               types = ZFS_TYPE_VOLUME;
                        } else {
                                (void) fprintf(stderr, gettext("share type "
-                                   "must be 'nfs', 'smb' or 'iscsi'\n"));
+                                   "must be 'nfs' or 'smb'\n"));
                                usage(B_FALSE);
                        }
                        protocol = argv[0];
@@ -3273,6 +3381,7 @@ share_mount(int op, int argc, char **argv)
                        usage(B_FALSE);
                }
 
+               start_progress_timer();
                get_all_datasets(types, &dslist, &count, verbose);
 
                if (count == 0)
@@ -3341,7 +3450,7 @@ share_mount(int op, int argc, char **argv)
 }
 
 /*
- * zfs mount -a [nfs | iscsi]
+ * zfs mount -a [nfs]
  * zfs mount filesystem
  *
  * Mount all filesystems, or mount the given filesystem.
@@ -3353,7 +3462,7 @@ zfs_do_mount(int argc, char **argv)
 }
 
 /*
- * zfs share -a [nfs | iscsi | smb]
+ * zfs share -a [nfs | smb]
  * zfs share filesystem
  *
  * Share all filesystems, or share the given filesystem.
@@ -3509,7 +3618,7 @@ unshare_unmount(int op, int argc, char **argv)
        int ret = 0;
        int types, c;
        zfs_handle_t *zhp;
-       char nfsiscsi_mnt_prop[ZFS_MAXPROPLEN];
+       char nfs_mnt_prop[ZFS_MAXPROPLEN];
        char sharesmb[ZFS_MAXPROPLEN];
 
        /* check options */
@@ -3558,21 +3667,12 @@ unshare_unmount(int op, int argc, char **argv)
                        usage(B_FALSE);
                }
 
-               if ((pool = uu_avl_pool_create("unmount_pool",
+               if (((pool = uu_avl_pool_create("unmount_pool",
                    sizeof (unshare_unmount_node_t),
                    offsetof(unshare_unmount_node_t, un_avlnode),
-                   unshare_unmount_compare,
-                   UU_DEFAULT)) == NULL) {
-                       (void) fprintf(stderr, gettext("internal error: "
-                           "out of memory\n"));
-                       exit(1);
-               }
-
-               if ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL) {
-                       (void) fprintf(stderr, gettext("internal error: "
-                           "out of memory\n"));
-                       exit(1);
-               }
+                   unshare_unmount_compare, UU_DEFAULT)) == NULL) ||
+                   ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL))
+                       nomem();
 
                rewind(mnttab_file);
                while (getmntent(mnttab_file, &entry) == 0) {
@@ -3594,25 +3694,25 @@ unshare_unmount(int op, int argc, char **argv)
                        switch (op) {
                        case OP_SHARE:
                                verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
-                                   nfsiscsi_mnt_prop,
-                                   sizeof (nfsiscsi_mnt_prop),
+                                   nfs_mnt_prop,
+                                   sizeof (nfs_mnt_prop),
                                    NULL, NULL, 0, B_FALSE) == 0);
-                               if (strcmp(nfsiscsi_mnt_prop, "off") != 0)
+                               if (strcmp(nfs_mnt_prop, "off") != 0)
                                        break;
                                verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
-                                   nfsiscsi_mnt_prop,
-                                   sizeof (nfsiscsi_mnt_prop),
+                                   nfs_mnt_prop,
+                                   sizeof (nfs_mnt_prop),
                                    NULL, NULL, 0, B_FALSE) == 0);
-                               if (strcmp(nfsiscsi_mnt_prop, "off") == 0)
+                               if (strcmp(nfs_mnt_prop, "off") == 0)
                                        continue;
                                break;
                        case OP_MOUNT:
                                /* Ignore legacy mounts */
                                verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT,
-                                   nfsiscsi_mnt_prop,
-                                   sizeof (nfsiscsi_mnt_prop),
+                                   nfs_mnt_prop,
+                                   sizeof (nfs_mnt_prop),
                                    NULL, NULL, 0, B_FALSE) == 0);
-                               if (strcmp(nfsiscsi_mnt_prop, "legacy") == 0)
+                               if (strcmp(nfs_mnt_prop, "legacy") == 0)
                                        continue;
                                /* Ignore canmount=noauto mounts */
                                if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) ==
@@ -3624,13 +3724,7 @@ unshare_unmount(int op, int argc, char **argv)
 
                        node = safe_malloc(sizeof (unshare_unmount_node_t));
                        node->un_zhp = zhp;
-
-                       if ((node->un_mountp = strdup(entry.mnt_mountp)) ==
-                           NULL) {
-                               (void) fprintf(stderr, gettext("internal error:"
-                                   " out of memory\n"));
-                               exit(1);
-                       }
+                       node->un_mountp = safe_strdup(entry.mnt_mountp);
 
                        uu_avl_node_init(node, &node->un_avlnode, pool);
 
@@ -3648,11 +3742,8 @@ unshare_unmount(int op, int argc, char **argv)
                 * removing it from the AVL tree in the process.
                 */
                if ((walk = uu_avl_walk_start(tree,
-                   UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL) {
-                       (void) fprintf(stderr,
-                           gettext("internal error: out of memory"));
-                       exit(1);
-               }
+                   UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL)
+                       nomem();
 
                while ((node = uu_avl_walk_next(walk)) != NULL) {
                        uu_avl_remove(tree, node);
@@ -3680,29 +3771,6 @@ unshare_unmount(int op, int argc, char **argv)
                uu_avl_destroy(tree);
                uu_avl_pool_destroy(pool);
 
-               if (op == OP_SHARE) {
-                       /*
-                        * Finally, unshare any volumes shared via iSCSI.
-                        */
-                       zfs_handle_t **dslist = NULL;
-                       size_t i, count = 0;
-
-                       get_all_datasets(ZFS_TYPE_VOLUME, &dslist, &count,
-                           B_FALSE);
-
-                       if (count != 0) {
-                               qsort(dslist, count, sizeof (void *),
-                                   dataset_cmp);
-
-                               for (i = 0; i < count; i++) {
-                                       if (zfs_unshare_iscsi(dslist[i]) != 0)
-                                               ret = 1;
-                                       zfs_close(dslist[i]);
-                               }
-
-                               free(dslist);
-                       }
-               }
        } else {
                if (argc != 1) {
                        if (argc == 0)
@@ -3734,20 +3802,20 @@ unshare_unmount(int op, int argc, char **argv)
                if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
                        verify(zfs_prop_get(zhp, op == OP_SHARE ?
                            ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT,
-                           nfsiscsi_mnt_prop, sizeof (nfsiscsi_mnt_prop), NULL,
+                           nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL,
                            NULL, 0, B_FALSE) == 0);
 
                        switch (op) {
                        case OP_SHARE:
                                verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
-                                   nfsiscsi_mnt_prop,
-                                   sizeof (nfsiscsi_mnt_prop),
+                                   nfs_mnt_prop,
+                                   sizeof (nfs_mnt_prop),
                                    NULL, NULL, 0, B_FALSE) == 0);
                                verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
                                    sharesmb, sizeof (sharesmb), NULL, NULL,
                                    0, B_FALSE) == 0);
 
-                               if (strcmp(nfsiscsi_mnt_prop, "off") == 0 &&
+                               if (strcmp(nfs_mnt_prop, "off") == 0 &&
                                    strcmp(sharesmb, "off") == 0) {
                                        (void) fprintf(stderr, gettext("cannot "
                                            "unshare '%s': legacy share\n"),
@@ -3767,7 +3835,7 @@ unshare_unmount(int op, int argc, char **argv)
                                break;
 
                        case OP_MOUNT:
-                               if (strcmp(nfsiscsi_mnt_prop, "legacy") == 0) {
+                               if (strcmp(nfs_mnt_prop, "legacy") == 0) {
                                        (void) fprintf(stderr, gettext("cannot "
                                            "unmount '%s': legacy "
                                            "mountpoint\n"), zfs_get_name(zhp));
@@ -3786,29 +3854,6 @@ unshare_unmount(int op, int argc, char **argv)
                                }
                                break;
                        }
-               } else {
-                       assert(op == OP_SHARE);
-
-                       verify(zfs_prop_get(zhp, ZFS_PROP_SHAREISCSI,
-                           nfsiscsi_mnt_prop, sizeof (nfsiscsi_mnt_prop),
-                           NULL, NULL, 0, B_FALSE) == 0);
-
-                       if (strcmp(nfsiscsi_mnt_prop, "off") == 0) {
-                               (void) fprintf(stderr, gettext("cannot unshare "
-                                   "'%s': 'shareiscsi' property not set\n"),
-                                   zfs_get_name(zhp));
-                               (void) fprintf(stderr, gettext("set "
-                                   "'shareiscsi' property or use "
-                                   "iscsitadm(1M) to share this volume\n"));
-                               ret = 1;
-                       } else if (!zfs_is_shared_iscsi(zhp)) {
-                               (void) fprintf(stderr, gettext("cannot "
-                                   "unshare '%s': not currently shared\n"),
-                                   zfs_get_name(zhp));
-                               ret = 1;
-                       } else if (zfs_unshare_iscsi(zhp) != 0) {
-                               ret = 1;
-                       }
                }
 
                zfs_close(zhp);
@@ -3986,27 +4031,6 @@ manual_unmount(int argc, char **argv)
 }
 
 static int
-volcheck(zpool_handle_t *zhp, void *data)
-{
-       boolean_t isinit = *((boolean_t *)data);
-
-       if (isinit)
-               return (zpool_create_zvol_links(zhp));
-       else
-               return (zpool_remove_zvol_links(zhp));
-}
-
-/*
- * Iterate over all pools in the system and either create or destroy /dev/zvol
- * links, depending on the value of 'isinit'.
- */
-static int
-do_volcheck(boolean_t isinit)
-{
-       return (zpool_iter(g_zfs, volcheck, &isinit) ? 1 : 0);
-}
-
-static int
 find_command_idx(char *command, int *idx)
 {
        int i;
@@ -4092,15 +4116,6 @@ main(int argc, char **argv)
                        usage(B_TRUE);
 
                /*
-                * 'volinit' and 'volfini' do not appear in the usage message,
-                * so we have to special case them here.
-                */
-               if (strcmp(cmdname, "volinit") == 0)
-                       return (do_volcheck(B_TRUE));
-               else if (strcmp(cmdname, "volfini") == 0)
-                       return (do_volcheck(B_FALSE));
-
-               /*
                 * Run the appropriate command.
                 */
                libzfs_mnttab_cache(g_zfs, B_TRUE);
index c7f2f16..3ddff9e 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _ZFS_UTIL_H
 #define        _ZFS_UTIL_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <libzfs.h>
 
 #ifdef __cplusplus
@@ -35,6 +32,7 @@ extern "C" {
 #endif
 
 void * safe_malloc(size_t size);
+void nomem(void);
 libzfs_handle_t *g_zfs;
 
 #ifdef __cplusplus
index c85e024..cd967a8 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <libzfs.h>
 
-#undef verify  /* both libzfs.h and zfs_context.h want to define this */
-
 #include <sys/zfs_context.h>
 
 #include <errno.h>
@@ -69,6 +66,18 @@ ziprintf(const char *fmt, ...)
        va_end(ap);
 }
 
+static void
+compress_slashes(const char *src, char *dest)
+{
+       while (*src != '\0') {
+               *dest = *src++;
+               while (*dest == '/' && *src == '/')
+                       ++src;
+               ++dest;
+       }
+       *dest = '\0';
+}
+
 /*
  * Given a full path to a file, translate into a dataset name and a relative
  * path within the dataset.  'dataset' must be at least MAXNAMELEN characters,
@@ -76,13 +85,16 @@ ziprintf(const char *fmt, ...)
  * buffer, which we need later to get the object ID.
  */
 static int
-parse_pathname(const char *fullpath, char *dataset, char *relpath,
+parse_pathname(const char *inpath, char *dataset, char *relpath,
     struct stat64 *statbuf)
 {
        struct extmnttab mp;
        FILE *fp;
        int match;
        const char *rel;
+       char fullpath[MAXPATHLEN];
+
+       compress_slashes(inpath, fullpath);
 
        if (fullpath[0] != '/') {
                (void) fprintf(stderr, "invalid object '%s': must be full "
@@ -162,8 +174,8 @@ object_from_path(const char *dataset, const char *path, struct stat64 *statbuf,
         */
        sync();
 
-       if ((err = dmu_objset_open(dataset, DMU_OST_ZFS,
-           DS_MODE_USER | DS_MODE_READONLY, &os)) != 0) {
+       err = dmu_objset_own(dataset, DMU_OST_ZFS, B_TRUE, FTAG, &os);
+       if (err != 0) {
                (void) fprintf(stderr, "cannot open dataset '%s': %s\n",
                    dataset, strerror(err));
                return (-1);
@@ -172,7 +184,7 @@ object_from_path(const char *dataset, const char *path, struct stat64 *statbuf,
        record->zi_objset = dmu_objset_id(os);
        record->zi_object = statbuf->st_ino;
 
-       dmu_objset_close(os);
+       dmu_objset_disown(os, FTAG);
 
        return (0);
 }
@@ -247,17 +259,17 @@ calculate_range(const char *dataset, err_type_t type, int level, char *range,
         * Get the dnode associated with object, so we can calculate the block
         * size.
         */
-       if ((err = dmu_objset_open(dataset, DMU_OST_ANY,
-           DS_MODE_USER | DS_MODE_READONLY, &os)) != 0) {
+       if ((err = dmu_objset_own(dataset, DMU_OST_ANY,
+           B_TRUE, FTAG, &os)) != 0) {
                (void) fprintf(stderr, "cannot open dataset '%s': %s\n",
                    dataset, strerror(err));
                goto out;
        }
 
        if (record->zi_object == 0) {
-               dn = os->os->os_meta_dnode;
+               dn = os->os_meta_dnode;
        } else {
-               err = dnode_hold(os->os, record->zi_object, FTAG, &dn);
+               err = dnode_hold(os, record->zi_object, FTAG, &dn);
                if (err != 0) {
                        (void) fprintf(stderr, "failed to hold dnode "
                            "for object %llu\n",
@@ -306,11 +318,11 @@ calculate_range(const char *dataset, err_type_t type, int level, char *range,
        ret = 0;
 out:
        if (dn) {
-               if (dn != os->os->os_meta_dnode)
+               if (dn != os->os_meta_dnode)
                        dnode_rele(dn, FTAG);
        }
        if (os)
-               dmu_objset_close(os);
+               dmu_objset_disown(os, FTAG);
 
        return (ret);
 }
@@ -347,8 +359,8 @@ translate_record(err_type_t type, const char *object, const char *range,
                case TYPE_CONFIG:
                        record->zi_type = DMU_OT_PACKED_NVLIST;
                        break;
-               case TYPE_BPLIST:
-                       record->zi_type = DMU_OT_BPLIST;
+               case TYPE_BPOBJ:
+                       record->zi_type = DMU_OT_BPOBJ;
                        break;
                case TYPE_SPACEMAP:
                        record->zi_type = DMU_OT_SPACE_MAP;
@@ -469,6 +481,14 @@ translate_device(const char *pool, const char *device, err_type_t label_type,
                record->zi_start = offsetof(vdev_label_t, vl_vdev_phys);
                record->zi_end = record->zi_start + VDEV_PHYS_SIZE - 1;
                break;
+       case TYPE_LABEL_PAD1:
+               record->zi_start = offsetof(vdev_label_t, vl_pad1);
+               record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1;
+               break;
+       case TYPE_LABEL_PAD2:
+               record->zi_start = offsetof(vdev_label_t, vl_pad2);
+               record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1;
+               break;
        }
        return (0);
 }
index 09c377e..ab04e42 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
  * any attempt to read from the device will return EIO, but any attempt to
  * reopen the device will also return ENXIO.
  * For label faults, the -L option must be specified. This allows faults
- * to be injected into either the nvlist or uberblock region of all the labels
- * for the specified device.
+ * to be injected into either the nvlist, uberblock, pad1, or pad2 region
+ * of all the labels for the specified device.
  *
  * This form of the command looks like:
  *
- *     zinject -d device [-e errno] [-L <uber | nvlist>] pool
+ *     zinject -d device [-e errno] [-L <uber | nvlist | pad1 | pad2>] pool
  *
  *
  * DATA FAULTS
@@ -70,7 +69,7 @@
  *             mos             Any data in the MOS
  *             mosdir          object directory
  *             config          pool configuration
- *             bplist          blkptr list
+ *             bpobj           blkptr list
  *             spacemap        spacemap
  *             metaslab        metaslab
  *             errlog          persistent error log
@@ -164,11 +163,13 @@ static const char *errtable[TYPE_INVAL] = {
        "mosdir",
        "metaslab",
        "config",
-       "bplist",
+       "bpobj",
        "spacemap",
        "errlog",
        "uber",
-       "nvlist"
+       "nvlist",
+       "pad1",
+       "pad2"
 };
 
 static err_type_t
@@ -192,8 +193,8 @@ type_to_name(uint64_t type)
                return ("metaslab");
        case DMU_OT_PACKED_NVLIST:
                return ("config");
-       case DMU_OT_BPLIST:
-               return ("bplist");
+       case DMU_OT_BPOBJ:
+               return ("bpobj");
        case DMU_OT_SPACE_MAP:
                return ("spacemap");
        case DMU_OT_ERROR_LOG:
@@ -222,11 +223,28 @@ usage(void)
            "\t\tClear the particular record (if given a numeric ID), or\n"
            "\t\tall records if 'all' is specificed.\n"
            "\n"
-           "\tzinject -d device [-e errno] [-L <nvlist|uber>] [-F] pool\n"
+           "\tzinject -p <function name> pool\n"
+           "\t\tInject a panic fault at the specified function. Only \n"
+           "\t\tfunctions which call spa_vdev_config_exit(), or \n"
+           "\t\tspa_vdev_exit() will trigger a panic.\n"
+           "\n"
+           "\tzinject -d device [-e errno] [-L <nvlist|uber|pad1|pad2>] [-F]\n"
+           "\t    [-T <read|write|free|claim|all> pool\n"
            "\t\tInject a fault into a particular device or the device's\n"
-           "\t\tlabel.  Label injection can either be 'nvlist' or 'uber'.\n"
+           "\t\tlabel.  Label injection can either be 'nvlist', 'uber',\n "
+           "\t\t'pad1', or 'pad2'.\n"
            "\t\t'errno' can either be 'nxio' (the default) or 'io'.\n"
            "\n"
+           "\tzinject -d device -A <degrade|fault> pool\n"
+           "\t\tPerform a specific action on a particular device\n"
+           "\n"
+           "\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
+           "\t\tCause the pool to stop writing blocks yet not\n"
+           "\t\treport errors for a duration.  Simulates buggy hardware\n"
+           "\t\tthat fails to honor cache flush requests.\n"
+           "\t\tDefault duration is 30 seconds.  The machine is panicked\n"
+           "\t\tat the end of the duration.\n"
+           "\n"
            "\tzinject -b objset:object:level:blkid pool\n"
            "\n"
            "\t\tInject an error into pool 'pool' with the numeric bookmark\n"
@@ -267,7 +285,7 @@ usage(void)
            "\t\t\ton a ZFS filesystem.\n"
            "\n"
            "\t-t <mos>\tInject errors into the MOS for objects of the given\n"
-           "\t\t\ttype.  Valid types are: mos, mosdir, config, bplist,\n"
+           "\t\t\ttype.  Valid types are: mos, mosdir, config, bpobj,\n"
            "\t\t\tspacemap, metaslab, errlog.  The only valid <object> is\n"
            "\t\t\tthe poolname.\n");
 }
@@ -286,6 +304,12 @@ iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *),
                    &zc.zc_inject_record, data)) != 0)
                        return (ret);
 
+       if (errno != ENOENT) {
+               (void) fprintf(stderr, "Unable to list handlers: %s\n",
+                   strerror(errno));
+               return (-1);
+       }
+
        return (0);
 }
 
@@ -295,7 +319,7 @@ print_data_handler(int id, const char *pool, zinject_record_t *record,
 {
        int *count = data;
 
-       if (record->zi_guid != 0)
+       if (record->zi_guid != 0 || record->zi_func[0] != '\0')
                return (0);
 
        if (*count == 0) {
@@ -327,7 +351,7 @@ print_device_handler(int id, const char *pool, zinject_record_t *record,
 {
        int *count = data;
 
-       if (record->zi_guid == 0)
+       if (record->zi_guid == 0 || record->zi_func[0] != '\0')
                return (0);
 
        if (*count == 0) {
@@ -343,6 +367,27 @@ print_device_handler(int id, const char *pool, zinject_record_t *record,
        return (0);
 }
 
+static int
+print_panic_handler(int id, const char *pool, zinject_record_t *record,
+    void *data)
+{
+       int *count = data;
+
+       if (record->zi_func[0] == '\0')
+               return (0);
+
+       if (*count == 0) {
+               (void) printf("%3s  %-15s  %s\n", "ID", "POOL", "FUNCTION");
+               (void) printf("---  ---------------  ----------------\n");
+       }
+
+       *count += 1;
+
+       (void) printf("%3d  %-15s  %s\n", id, pool, record->zi_func);
+
+       return (0);
+}
+
 /*
  * Print all registered error handlers.  Returns the number of handlers
  * registered.
@@ -356,6 +401,9 @@ print_all_handlers(void)
        (void) printf("\n");
        count = 0;
        (void) iter_handlers(print_data_handler, &count);
+       (void) printf("\n");
+       count = 0;
+       (void) iter_handlers(print_panic_handler, &count);
 
        return (count);
 }
@@ -386,7 +434,8 @@ cancel_all_handlers(void)
 {
        int ret = iter_handlers(cancel_one_handler, NULL);
 
-       (void) printf("removed all registered handlers\n");
+       if (ret == 0)
+               (void) printf("removed all registered handlers\n");
 
        return (ret);
 }
@@ -443,6 +492,15 @@ register_handler(const char *pool, int flags, zinject_record_t *record,
                if (record->zi_guid) {
                        (void) printf("  vdev: %llx\n",
                            (u_longlong_t)record->zi_guid);
+               } else if (record->zi_func[0] != '\0') {
+                       (void) printf("  panic function: %s\n",
+                           record->zi_func);
+               } else if (record->zi_duration > 0) {
+                       (void) printf(" time: %lld seconds\n",
+                           (u_longlong_t)record->zi_duration);
+               } else if (record->zi_duration < 0) {
+                       (void) printf(" txgs: %lld \n",
+                           (u_longlong_t)-record->zi_duration);
                } else {
                        (void) printf("objset: %llu\n",
                            (u_longlong_t)record->zi_objset);
@@ -465,6 +523,22 @@ register_handler(const char *pool, int flags, zinject_record_t *record,
 }
 
 int
+perform_action(const char *pool, zinject_record_t *record, int cmd)
+{
+       zfs_cmd_t zc;
+
+       ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED);
+       (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
+       zc.zc_guid = record->zi_guid;
+       zc.zc_cookie = cmd;
+
+       if (ioctl(zfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+               return (0);
+
+       return (1);
+}
+
+int
 main(int argc, char **argv)
 {
        int c;
@@ -477,12 +551,17 @@ main(int argc, char **argv)
        int quiet = 0;
        int error = 0;
        int domount = 0;
+       int io_type = ZIO_TYPES;
+       int action = VDEV_STATE_UNKNOWN;
        err_type_t type = TYPE_INVAL;
        err_type_t label = TYPE_INVAL;
        zinject_record_t record = { 0 };
        char pool[MAXNAMELEN];
        char dataset[MAXNAMELEN];
        zfs_handle_t *zhp;
+       int nowrites = 0;
+       int dur_txg = 0;
+       int dur_secs = 0;
        int ret;
        int flags = 0;
 
@@ -514,11 +593,24 @@ main(int argc, char **argv)
                return (0);
        }
 
-       while ((c = getopt(argc, argv, ":ab:d:f:Fqhc:t:l:mr:e:uL:")) != -1) {
+       while ((c = getopt(argc, argv,
+           ":aA:b:d:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
                switch (c) {
                case 'a':
                        flags |= ZINJECT_FLUSH_ARC;
                        break;
+               case 'A':
+                       if (strcasecmp(optarg, "degrade") == 0) {
+                               action = VDEV_STATE_DEGRADED;
+                       } else if (strcasecmp(optarg, "fault") == 0) {
+                               action = VDEV_STATE_FAULTED;
+                       } else {
+                               (void) fprintf(stderr, "invalid action '%s': "
+                                   "must be 'degrade' or 'fault'\n", optarg);
+                               usage();
+                               return (1);
+                       }
+                       break;
                case 'b':
                        raw = optarg;
                        break;
@@ -554,9 +646,27 @@ main(int argc, char **argv)
                case 'F':
                        record.zi_failfast = B_TRUE;
                        break;
+               case 'g':
+                       dur_txg = 1;
+                       record.zi_duration = (int)strtol(optarg, &end, 10);
+                       if (record.zi_duration <= 0 || *end != '\0') {
+                               (void) fprintf(stderr, "invalid duration '%s': "
+                                   "must be a positive integer\n", optarg);
+                               usage();
+                               return (1);
+                       }
+                       /* store duration of txgs as its negative */
+                       record.zi_duration *= -1;
+                       break;
                case 'h':
                        usage();
                        return (0);
+               case 'I':
+                       /* default duration, if one hasn't yet been defined */
+                       nowrites = 1;
+                       if (dur_secs == 0 && dur_txg == 0)
+                               record.zi_duration = 30;
+                       break;
                case 'l':
                        level = (int)strtol(optarg, &end, 10);
                        if (*end != '\0') {
@@ -569,12 +679,45 @@ main(int argc, char **argv)
                case 'm':
                        domount = 1;
                        break;
+               case 'p':
+                       (void) strlcpy(record.zi_func, optarg,
+                           sizeof (record.zi_func));
+                       break;
                case 'q':
                        quiet = 1;
                        break;
                case 'r':
                        range = optarg;
                        break;
+               case 's':
+                       dur_secs = 1;
+                       record.zi_duration = (int)strtol(optarg, &end, 10);
+                       if (record.zi_duration <= 0 || *end != '\0') {
+                               (void) fprintf(stderr, "invalid duration '%s': "
+                                   "must be a positive integer\n", optarg);
+                               usage();
+                               return (1);
+                       }
+                       break;
+               case 'T':
+                       if (strcasecmp(optarg, "read") == 0) {
+                               io_type = ZIO_TYPE_READ;
+                       } else if (strcasecmp(optarg, "write") == 0) {
+                               io_type = ZIO_TYPE_WRITE;
+                       } else if (strcasecmp(optarg, "free") == 0) {
+                               io_type = ZIO_TYPE_FREE;
+                       } else if (strcasecmp(optarg, "claim") == 0) {
+                               io_type = ZIO_TYPE_CLAIM;
+                       } else if (strcasecmp(optarg, "all") == 0) {
+                               io_type = ZIO_TYPES;
+                       } else {
+                               (void) fprintf(stderr, "invalid I/O type "
+                                   "'%s': must be 'read', 'write', 'free', "
+                                   "'claim' or 'all'\n", optarg);
+                               usage();
+                               return (1);
+                       }
+                       break;
                case 't':
                        if ((type = name_to_type(optarg)) == TYPE_INVAL &&
                            !MOS_TYPE(type)) {
@@ -617,7 +760,8 @@ main(int argc, char **argv)
                 * '-c' is invalid with any other options.
                 */
                if (raw != NULL || range != NULL || type != TYPE_INVAL ||
-                   level != 0) {
+                   level != 0 || record.zi_func[0] != '\0' ||
+                   record.zi_duration != 0) {
                        (void) fprintf(stderr, "cancel (-c) incompatible with "
                            "any other options\n");
                        usage();
@@ -649,7 +793,8 @@ main(int argc, char **argv)
                 * for doing injection, so handle it separately here.
                 */
                if (raw != NULL || range != NULL || type != TYPE_INVAL ||
-                   level != 0) {
+                   level != 0 || record.zi_func[0] != '\0' ||
+                   record.zi_duration != 0) {
                        (void) fprintf(stderr, "device (-d) incompatible with "
                            "data error injection\n");
                        usage();
@@ -672,12 +817,18 @@ main(int argc, char **argv)
                        return (1);
                }
 
+               record.zi_iotype = io_type;
                if (translate_device(pool, device, label, &record) != 0)
                        return (1);
                if (!error)
                        error = ENXIO;
+
+               if (action != VDEV_STATE_UNKNOWN)
+                       return (perform_action(pool, &record, action));
+
        } else if (raw != NULL) {
-               if (range != NULL || type != TYPE_INVAL || level != 0) {
+               if (range != NULL || type != TYPE_INVAL || level != 0 ||
+                   record.zi_func[0] != '\0' || record.zi_duration != 0) {
                        (void) fprintf(stderr, "raw (-b) format with "
                            "any other options\n");
                        usage();
@@ -704,10 +855,52 @@ main(int argc, char **argv)
                        return (1);
                if (!error)
                        error = EIO;
+       } else if (record.zi_func[0] != '\0') {
+               if (raw != NULL || range != NULL || type != TYPE_INVAL ||
+                   level != 0 || device != NULL || record.zi_duration != 0) {
+                       (void) fprintf(stderr, "panic (-p) incompatible with "
+                           "other options\n");
+                       usage();
+                       return (2);
+               }
+
+               if (argc < 1 || argc > 2) {
+                       (void) fprintf(stderr, "panic (-p) injection requires "
+                           "a single pool name and an optional id\n");
+                       usage();
+                       return (2);
+               }
+
+               (void) strcpy(pool, argv[0]);
+               if (argv[1] != NULL)
+                       record.zi_type = atoi(argv[1]);
+               dataset[0] = '\0';
+       } else if (record.zi_duration != 0) {
+               if (nowrites == 0) {
+                       (void) fprintf(stderr, "-s or -g meaningless "
+                           "without -I (ignore writes)\n");
+                       usage();
+                       return (2);
+               } else if (dur_secs && dur_txg) {
+                       (void) fprintf(stderr, "choose a duration either "
+                           "in seconds (-s) or a number of txgs (-g) "
+                           "but not both\n");
+                       usage();
+                       return (2);
+               } else if (argc != 1) {
+                       (void) fprintf(stderr, "ignore writes (-I) "
+                           "injection requires a single pool name\n");
+                       usage();
+                       return (2);
+               }
+
+               (void) strcpy(pool, argv[0]);
+               dataset[0] = '\0';
        } else if (type == TYPE_INVAL) {
                if (flags == 0) {
                        (void) fprintf(stderr, "at least one of '-b', '-d', "
-                           "'-t', '-a', or '-u' must be specified\n");
+                           "'-t', '-a', '-p', '-I' or '-u' "
+                           "must be specified\n");
                        usage();
                        return (2);
                }
index adc3efe..46fdcad 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _ZINJECT_H
 #define        _ZINJECT_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/zfs_ioctl.h>
 
 #ifdef __cplusplus
@@ -41,11 +38,13 @@ typedef enum {
        TYPE_MOSDIR,            /* MOS object directory         */
        TYPE_METASLAB,          /* metaslab objects             */
        TYPE_CONFIG,            /* MOS config                   */
-       TYPE_BPLIST,            /* block pointer list           */
+       TYPE_BPOBJ,             /* block pointer list           */
        TYPE_SPACEMAP,          /* space map objects            */
        TYPE_ERRLOG,            /* persistent error log         */
        TYPE_LABEL_UBERBLOCK,   /* label specific uberblock     */
        TYPE_LABEL_NVLIST,      /* label specific nvlist        */
+       TYPE_LABEL_PAD1,        /* label specific 8K pad1 area  */
+       TYPE_LABEL_PAD2,        /* label specific 8K pad2 area  */
        TYPE_INVAL
 } err_type_t;
 
index c9b092e..62c4be8 100644 (file)
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <assert.h>
@@ -42,7 +41,6 @@
 #include <pwd.h>
 #include <zone.h>
 #include <sys/fs/zfs.h>
-
 #include <sys/stat.h>
 
 #include <libzfs.h>
@@ -50,6 +48,8 @@
 #include "zpool_util.h"
 #include "zfs_comutil.h"
 
+#include "statcommon.h"
+
 static int zpool_do_create(int, char **);
 static int zpool_do_destroy(int, char **);
 
@@ -67,6 +67,7 @@ static int zpool_do_clear(int, char **);
 static int zpool_do_attach(int, char **);
 static int zpool_do_detach(int, char **);
 static int zpool_do_replace(int, char **);
+static int zpool_do_split(int, char **);
 
 static int zpool_do_scrub(int, char **);
 
@@ -119,7 +120,8 @@ typedef enum {
        HELP_STATUS,
        HELP_UPGRADE,
        HELP_GET,
-       HELP_SET
+       HELP_SET,
+       HELP_SPLIT
 } zpool_help_t;
 
 
@@ -156,6 +158,7 @@ static zpool_command_t command_table[] = {
        { "attach",     zpool_do_attach,        HELP_ATTACH             },
        { "detach",     zpool_do_detach,        HELP_DETACH             },
        { "replace",    zpool_do_replace,       HELP_REPLACE            },
+       { "split",      zpool_do_split,         HELP_SPLIT              },
        { NULL },
        { "scrub",      zpool_do_scrub,         HELP_SCRUB              },
        { NULL },
@@ -173,6 +176,8 @@ static zpool_command_t command_table[] = {
 zpool_command_t *current_command;
 static char history_str[HIS_MAX_RECORD_LEN];
 
+static uint_t timestamp_fmt = NODATE;
+
 static const char *
 get_usage(zpool_help_t idx) {
        switch (idx) {
@@ -182,7 +187,7 @@ get_usage(zpool_help_t idx) {
                return (gettext("\tattach [-f] <pool> <device> "
                    "<new-device>\n"));
        case HELP_CLEAR:
-               return (gettext("\tclear <pool> [device]\n"));
+               return (gettext("\tclear [-nF] <pool> [device]\n"));
        case HELP_CREATE:
                return (gettext("\tcreate [-fn] [-o property=value] ... \n"
                    "\t    [-O file-system-property=value] ... \n"
@@ -197,17 +202,18 @@ get_usage(zpool_help_t idx) {
                return (gettext("\thistory [-il] [<pool>] ...\n"));
        case HELP_IMPORT:
                return (gettext("\timport [-d dir] [-D]\n"
+                   "\timport [-d dir | -c cachefile] [-n] -F <pool | id>\n"
                    "\timport [-o mntopts] [-o property=value] ... \n"
                    "\t    [-d dir | -c cachefile] [-D] [-f] [-R root] -a\n"
                    "\timport [-o mntopts] [-o property=value] ... \n"
                    "\t    [-d dir | -c cachefile] [-D] [-f] [-R root] "
                    "<pool | id> [newpool]\n"));
        case HELP_IOSTAT:
-               return (gettext("\tiostat [-v] [pool] ... [interval "
+               return (gettext("\tiostat [-v] [-T d|u] [pool] ... [interval "
                    "[count]]\n"));
        case HELP_LIST:
                return (gettext("\tlist [-H] [-o property[,...]] "
-                   "[pool] ...\n"));
+                   "[-T d|u] [pool] ... [interval [count]]\n"));
        case HELP_OFFLINE:
                return (gettext("\toffline [-t] <pool> <device> ...\n"));
        case HELP_ONLINE:
@@ -220,7 +226,8 @@ get_usage(zpool_help_t idx) {
        case HELP_SCRUB:
                return (gettext("\tscrub [-s] <pool> ...\n"));
        case HELP_STATUS:
-               return (gettext("\tstatus [-vx] [pool] ...\n"));
+               return (gettext("\tstatus [-vx] [-T d|u] [pool] ... [interval "
+                   "[count]]\n"));
        case HELP_UPGRADE:
                return (gettext("\tupgrade\n"
                    "\tupgrade -v\n"
@@ -230,6 +237,10 @@ get_usage(zpool_help_t idx) {
                    "<pool> ...\n"));
        case HELP_SET:
                return (gettext("\tset <property=value> <pool> \n"));
+       case HELP_SPLIT:
+               return (gettext("\tsplit [-n] [-R altroot] [-o mntopts]\n"
+                   "\t    [-o property=value] <pool> <newpool> "
+                   "[<device> ...]\n"));
        }
 
        abort();
@@ -245,12 +256,12 @@ print_prop_cb(int prop, void *cb)
 {
        FILE *fp = cb;
 
-       (void) fprintf(fp, "\t%-13s  ", zpool_prop_to_name(prop));
+       (void) fprintf(fp, "\t%-15s  ", zpool_prop_to_name(prop));
 
        if (zpool_prop_readonly(prop))
                (void) fprintf(fp, "  NO   ");
        else
-               (void) fprintf(fp, " YES    ");
+               (void) fprintf(fp, " YES   ");
 
        if (zpool_prop_values(prop) == NULL)
                (void) fprintf(fp, "-\n");
@@ -297,7 +308,7 @@ usage(boolean_t requested)
                (void) fprintf(fp,
                    gettext("\nthe following properties are supported:\n"));
 
-               (void) fprintf(fp, "\n\t%-13s  %s  %s\n\n",
+               (void) fprintf(fp, "\n\t%-15s  %s   %s\n\n",
                    "PROPERTY", "EDIT", "VALUES");
 
                /* Iterate over all properties */
@@ -339,7 +350,7 @@ print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent,
                if ((is_log && !print_logs) || (!is_log && print_logs))
                        continue;
 
-               vname = zpool_vdev_name(g_zfs, zhp, child[c]);
+               vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE);
                print_vdev_tree(zhp, vname, child[c], indent + 2,
                    B_FALSE);
                free(vname);
@@ -507,11 +518,10 @@ zpool_do_add(int argc, char **argv)
 }
 
 /*
- * zpool remove <pool> <vdev> ...
+ * zpool remove  <pool> <vdev> ...
  *
- * Removes the given vdev from the pool.  Currently, this only supports removing
- * spares and cache devices from the pool.  Eventually, we'll want to support
- * removing leaf vdevs (as an alias for 'detach') as well as toplevel vdevs.
+ * Removes the given vdev from the pool.  Currently, this supports removing
+ * spares, cache, and log devices from the pool.
  */
 int
 zpool_do_remove(int argc, char **argv)
@@ -940,7 +950,7 @@ zpool_do_export(int argc, char **argv)
 static int
 max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max)
 {
-       char *name = zpool_vdev_name(g_zfs, zhp, nv);
+       char *name = zpool_vdev_name(g_zfs, zhp, nv, B_TRUE);
        nvlist_t **child;
        uint_t c, children;
        int ret;
@@ -1032,20 +1042,21 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
 {
        nvlist_t **child;
        uint_t c, children;
+       pool_scan_stat_t *ps = NULL;
        vdev_stat_t *vs;
-       char rbuf[6], wbuf[6], cbuf[6], repaired[7];
+       char rbuf[6], wbuf[6], cbuf[6];
        char *vname;
        uint64_t notpresent;
        spare_cbdata_t cb;
        char *state;
 
-       verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
-           (uint64_t **)&vs, &c) == 0);
-
        if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
            &child, &children) != 0)
                children = 0;
 
+       verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+           (uint64_t **)&vs, &c) == 0);
+
        state = zpool_state_to_name(vs->vs_state, vs->vs_aux);
        if (isspare) {
                /*
@@ -1123,31 +1134,43 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
                        (void) printf(gettext("bad intent log"));
                        break;
 
+               case VDEV_AUX_EXTERNAL:
+                       (void) printf(gettext("external device fault"));
+                       break;
+
+               case VDEV_AUX_SPLIT_POOL:
+                       (void) printf(gettext("split into new pool"));
+                       break;
+
                default:
                        (void) printf(gettext("corrupted data"));
                        break;
                }
-       } else if (vs->vs_scrub_repaired != 0 && children == 0) {
-               /*
-                * Report bytes resilvered/repaired on leaf devices.
-                */
-               zfs_nicenum(vs->vs_scrub_repaired, repaired, sizeof (repaired));
-               (void) printf(gettext("  %s %s"), repaired,
-                   (vs->vs_scrub_type == POOL_SCRUB_RESILVER) ?
-                   "resilvered" : "repaired");
+       }
+
+       (void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS,
+           (uint64_t **)&ps, &c);
+
+       if (ps && ps->pss_state == DSS_SCANNING &&
+           vs->vs_scan_processed != 0 && children == 0) {
+               (void) printf(gettext("  (%s)"),
+                   (ps->pss_func == POOL_SCAN_RESILVER) ?
+                   "resilvering" : "repairing");
        }
 
        (void) printf("\n");
 
        for (c = 0; c < children; c++) {
-               uint64_t is_log = B_FALSE;
+               uint64_t islog = B_FALSE, ishole = B_FALSE;
 
-               /* Don't print logs here */
+               /* Don't print logs or holes here */
                (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
-                   &is_log);
-               if (is_log)
+                   &islog);
+               (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
+                   &ishole);
+               if (islog || ishole)
                        continue;
-               vname = zpool_vdev_name(g_zfs, zhp, child[c]);
+               vname = zpool_vdev_name(g_zfs, zhp, child[c], B_TRUE);
                print_status_config(zhp, vname, child[c],
                    namewidth, depth + 2, isspare);
                free(vname);
@@ -1168,10 +1191,11 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
        char *type, *vname;
 
        verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
-       if (strcmp(type, VDEV_TYPE_MISSING) == 0)
+       if (strcmp(type, VDEV_TYPE_MISSING) == 0 ||
+           strcmp(type, VDEV_TYPE_HOLE) == 0)
                return;
 
-       verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
+       verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
            (uint64_t **)&vs, &c) == 0);
 
        (void) printf("\t%*s%-*s", depth, "", namewidth - depth, name);
@@ -1220,7 +1244,7 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
                if (is_log)
                        continue;
 
-               vname = zpool_vdev_name(g_zfs, NULL, child[c]);
+               vname = zpool_vdev_name(g_zfs, NULL, child[c], B_TRUE);
                print_import_config(vname, child[c], namewidth, depth + 2);
                free(vname);
        }
@@ -1229,7 +1253,7 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
            &child, &children) == 0) {
                (void) printf(gettext("\tcache\n"));
                for (c = 0; c < children; c++) {
-                       vname = zpool_vdev_name(g_zfs, NULL, child[c]);
+                       vname = zpool_vdev_name(g_zfs, NULL, child[c], B_FALSE);
                        (void) printf("\t  %s\n", vname);
                        free(vname);
                }
@@ -1239,7 +1263,7 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
            &child, &children) == 0) {
                (void) printf(gettext("\tspares\n"));
                for (c = 0; c < children; c++) {
-                       vname = zpool_vdev_name(g_zfs, NULL, child[c]);
+                       vname = zpool_vdev_name(g_zfs, NULL, child[c], B_FALSE);
                        (void) printf("\t  %s\n", vname);
                        free(vname);
                }
@@ -1274,7 +1298,7 @@ print_logs(zpool_handle_t *zhp, nvlist_t *nv, int namewidth, boolean_t verbose)
                    &is_log);
                if (!is_log)
                        continue;
-               name = zpool_vdev_name(g_zfs, zhp, child[c]);
+               name = zpool_vdev_name(g_zfs, zhp, child[c], B_TRUE);
                if (verbose)
                        print_status_config(zhp, name, child[c], namewidth,
                            2, B_FALSE);
@@ -1283,6 +1307,7 @@ print_logs(zpool_handle_t *zhp, nvlist_t *nv, int namewidth, boolean_t verbose)
                free(name);
        }
 }
+
 /*
  * Display the status for the given pool.
  */
@@ -1309,7 +1334,7 @@ show_import(nvlist_t *config)
        verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
            &nvroot) == 0);
 
-       verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
+       verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
            (uint64_t **)&vs, &vsc) == 0);
        health = zpool_state_to_name(vs->vs_state, vs->vs_aux);
 
@@ -1376,6 +1401,11 @@ show_import(nvlist_t *config)
                    "read.\n"));
                break;
 
+       case ZPOOL_STATUS_RESILVERING:
+               (void) printf(gettext("status: One or more devices were being "
+                   "resilvered.\n"));
+               break;
+
        default:
                /*
                 * No other status can be seen when importing pools.
@@ -1475,7 +1505,6 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
        char *name;
        uint64_t state;
        uint64_t version;
-       int error = 0;
 
        verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
            &name) == 0);
@@ -1538,7 +1567,7 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
        }
 
        zpool_close(zhp);
-       return (error);
+       return (0);
 }
 
 /*
@@ -1546,7 +1575,7 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
  *       import [-o mntopts] [-o prop=value] ... [-R root] [-D]
  *              [-d dir | -c cachefile] [-f] -a
  *       import [-o mntopts] [-o prop=value] ... [-R root] [-D]
- *              [-d dir | -c cachefile] [-f] <pool | id> [newpool]
+ *              [-d dir | -c cachefile] [-f] [-n] [-F] <pool | id> [newpool]
  *
  *      -c     Read pool information from a cachefile instead of searching
  *             devices.
@@ -1561,14 +1590,18 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
  *             the given root.  The pool will remain exported when the machine
  *             is rebooted.
  *
- *       -f    Force import, even if it appears that the pool is active.
- *
- *       -F    Import even in the presence of faulted vdevs.  This is an
+ *       -V    Import even in the presence of faulted vdevs.  This is an
  *             intentionally undocumented option for testing purposes, and
  *             treats the pool configuration as complete, leaving any bad
  *             vdevs in the FAULTED state. In other words, it does verbatim
  *             import.
  *
+ *       -f    Force import, even if it appears that the pool is active.
+ *
+ *       -F     Attempt rewind if necessary.
+ *
+ *       -n     See if rewind would work, but don't actually rewind.
+ *
  *       -a    Import all pools found.
  *
  *       -o    Set property=value and/or temporary mount options (without '=').
@@ -1582,7 +1615,7 @@ zpool_do_import(int argc, char **argv)
        char **searchdirs = NULL;
        int nsearch = 0;
        int c;
-       int err;
+       int err = 0;
        nvlist_t *pools = NULL;
        boolean_t do_all = B_FALSE;
        boolean_t do_destroyed = B_FALSE;
@@ -1594,14 +1627,20 @@ zpool_do_import(int argc, char **argv)
        char *searchname = NULL;
        char *propval;
        nvlist_t *found_config;
+       nvlist_t *policy = NULL;
        nvlist_t *props = NULL;
        boolean_t first;
        boolean_t do_verbatim = B_FALSE;
+       uint32_t rewind_policy = ZPOOL_NO_REWIND;
+       boolean_t dryrun = B_FALSE;
+       boolean_t do_rewind = B_FALSE;
+       boolean_t xtreme_rewind = B_FALSE;
        uint64_t pool_state;
        char *cachefile = NULL;
+       importargs_t idata = { 0 };
 
        /* check options */
-       while ((c = getopt(argc, argv, ":ac:d:DfFo:p:R:")) != -1) {
+       while ((c = getopt(argc, argv, ":aCc:d:DEfFno:rR:VX")) != -1) {
                switch (c) {
                case 'a':
                        do_all = B_TRUE;
@@ -1629,7 +1668,10 @@ zpool_do_import(int argc, char **argv)
                        do_force = B_TRUE;
                        break;
                case 'F':
-                       do_verbatim = B_TRUE;
+                       do_rewind = B_TRUE;
+                       break;
+               case 'n':
+                       dryrun = B_TRUE;
                        break;
                case 'o':
                        if ((propval = strchr(optarg, '=')) != NULL) {
@@ -1654,6 +1696,12 @@ zpool_do_import(int argc, char **argv)
                            ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE))
                                goto error;
                        break;
+               case 'V':
+                       do_verbatim = B_TRUE;
+                       break;
+               case 'X':
+                       xtreme_rewind = B_TRUE;
+                       break;
                case ':':
                        (void) fprintf(stderr, gettext("missing argument for "
                            "'%c' option\n"), optopt);
@@ -1674,6 +1722,23 @@ zpool_do_import(int argc, char **argv)
                usage(B_FALSE);
        }
 
+       if ((dryrun || xtreme_rewind) && !do_rewind) {
+               (void) fprintf(stderr,
+                   gettext("-n or -X only meaningful with -F\n"));
+               usage(B_FALSE);
+       }
+       if (dryrun)
+               rewind_policy = ZPOOL_TRY_REWIND;
+       else if (do_rewind)
+               rewind_policy = ZPOOL_DO_REWIND;
+       if (xtreme_rewind)
+               rewind_policy |= ZPOOL_EXTREME_REWIND;
+
+       /* In the future, we can capture further policy and include it here */
+       if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 ||
+           nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind_policy) != 0)
+               goto error;
+
        if (searchdirs == NULL) {
                searchdirs = safe_malloc(sizeof (char *));
                searchdirs[0] = "/dev/dsk";
@@ -1701,6 +1766,7 @@ zpool_do_import(int argc, char **argv)
                        (void) fprintf(stderr, gettext("cannot "
                            "discover pools: permission denied\n"));
                        free(searchdirs);
+                       nvlist_free(policy);
                        return (1);
                }
        }
@@ -1726,28 +1792,49 @@ zpool_do_import(int argc, char **argv)
                if (errno != 0 || *endptr != '\0')
                        searchname = argv[0];
                found_config = NULL;
-       }
 
-       if (cachefile) {
-               pools = zpool_find_import_cached(g_zfs, cachefile, searchname,
-                   searchguid);
-       } else if (searchname != NULL) {
-               pools = zpool_find_import_byname(g_zfs, nsearch, searchdirs,
-                   searchname);
-       } else {
                /*
-                * It's OK to search by guid even if searchguid is 0.
+                * User specified a name or guid.  Ensure it's unique.
                 */
-               pools = zpool_find_import_byguid(g_zfs, nsearch, searchdirs,
-                   searchguid);
-       }
-
-       if (pools == NULL) {
+               idata.unique = B_TRUE;
+       }
+
+
+       idata.path = searchdirs;
+       idata.paths = nsearch;
+       idata.poolname = searchname;
+       idata.guid = searchguid;
+       idata.cachefile = cachefile;
+
+       pools = zpool_search_import(g_zfs, &idata);
+
+       if (pools != NULL && idata.exists &&
+           (argc == 1 || strcmp(argv[0], argv[1]) == 0)) {
+               (void) fprintf(stderr, gettext("cannot import '%s': "
+                   "a pool with that name already exists\n"),
+                   argv[0]);
+               (void) fprintf(stderr, gettext("use the form '%s "
+                   "<pool | id> <newpool>' to give it a new name\n"),
+                   "zpool import");
+               err = 1;
+       } else if (pools == NULL && idata.exists) {
+               (void) fprintf(stderr, gettext("cannot import '%s': "
+                   "a pool with that name is already created/imported,\n"),
+                   argv[0]);
+               (void) fprintf(stderr, gettext("and no additional pools "
+                   "with that name were found\n"));
+               err = 1;
+       } else if (pools == NULL) {
                if (argc != 0) {
                        (void) fprintf(stderr, gettext("cannot import '%s': "
                            "no such pool available\n"), argv[0]);
                }
+               err = 1;
+       }
+
+       if (err == 1) {
                free(searchdirs);
+               nvlist_free(policy);
                return (1);
        }
 
@@ -1771,17 +1858,21 @@ zpool_do_import(int argc, char **argv)
                if (do_destroyed && pool_state != POOL_STATE_DESTROYED)
                        continue;
 
+               verify(nvlist_add_nvlist(config, ZPOOL_REWIND_POLICY,
+                   policy) == 0);
+
                if (argc == 0) {
                        if (first)
                                first = B_FALSE;
                        else if (!do_all)
                                (void) printf("\n");
 
-                       if (do_all)
+                       if (do_all) {
                                err |= do_import(config, NULL, mntopts,
                                    do_force, props, do_verbatim);
-                       else
+                       } else {
                                show_import(config);
+                       }
                } else if (searchname != NULL) {
                        char *name;
 
@@ -1842,6 +1933,7 @@ zpool_do_import(int argc, char **argv)
 error:
        nvlist_free(props);
        nvlist_free(pools);
+       nvlist_free(policy);
        free(searchdirs);
 
        return (err ? 1 : 0);
@@ -1869,7 +1961,7 @@ print_iostat_header(iostat_cbdata_t *cb)
 {
        (void) printf("%*s     capacity     operations    bandwidth\n",
            cb->cb_namewidth, "");
-       (void) printf("%-*s   used  avail   read  write   read  write\n",
+       (void) printf("%-*s  alloc   free   read  write   read  write\n",
            cb->cb_namewidth, "pool");
        print_iostat_separator(cb);
 }
@@ -1904,13 +1996,13 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
        char *vname;
 
        if (oldnv != NULL) {
-               verify(nvlist_lookup_uint64_array(oldnv, ZPOOL_CONFIG_STATS,
-                   (uint64_t **)&oldvs, &c) == 0);
+               verify(nvlist_lookup_uint64_array(oldnv,
+                   ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0);
        } else {
                oldvs = &zerovs;
        }
 
-       verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_STATS,
+       verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS,
            (uint64_t **)&newvs, &c) == 0);
 
        if (strlen(name) + depth > cb->cb_namewidth)
@@ -1960,7 +2052,13 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
                return;
 
        for (c = 0; c < children; c++) {
-               vname = zpool_vdev_name(g_zfs, zhp, newchild[c]);
+               uint64_t ishole = B_FALSE;
+
+               if (nvlist_lookup_uint64(newchild[c],
+                   ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole)
+                       continue;
+
+               vname = zpool_vdev_name(g_zfs, zhp, newchild[c], B_FALSE);
                print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
                    newchild[c], cb, depth + 2);
                free(vname);
@@ -1981,7 +2079,8 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
                (void) printf("%-*s      -      -      -      -      -      "
                    "-\n", cb->cb_namewidth, "cache");
                for (c = 0; c < children; c++) {
-                       vname = zpool_vdev_name(g_zfs, zhp, newchild[c]);
+                       vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
+                           B_FALSE);
                        print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
                            newchild[c], cb, depth + 2);
                        free(vname);
@@ -2070,42 +2169,14 @@ get_namewidth(zpool_handle_t *zhp, void *data)
 }
 
 /*
- * zpool iostat [-v] [pool] ... [interval [count]]
- *
- *     -v      Display statistics for individual vdevs
- *
- * This command can be tricky because we want to be able to deal with pool
- * creation/destruction as well as vdev configuration changes.  The bulk of this
- * processing is handled by the pool_list_* routines in zpool_iter.c.  We rely
- * on pool_list_update() to detect the addition of new pools.  Configuration
- * changes are all handled within libzfs.
+ * Parse the input string, get the 'interval' and 'count' value if there is one.
  */
-int
-zpool_do_iostat(int argc, char **argv)
+static void
+get_interval_count(int *argcp, char **argv, unsigned long *iv,
+    unsigned long *cnt)
 {
-       int c;
-       int ret;
-       int npools;
        unsigned long interval = 0, count = 0;
-       zpool_list_t *list;
-       boolean_t verbose = B_FALSE;
-       iostat_cbdata_t cb;
-
-       /* check options */
-       while ((c = getopt(argc, argv, "v")) != -1) {
-               switch (c) {
-               case 'v':
-                       verbose = B_TRUE;
-                       break;
-               case '?':
-                       (void) fprintf(stderr, gettext("invalid option '%c'\n"),
-                           optopt);
-                       usage(B_FALSE);
-               }
-       }
-
-       argc -= optind;
-       argv += optind;
+       int argc = *argcp, errno;
 
        /*
         * Determine if the last argument is an integer or a pool name
@@ -2122,7 +2193,6 @@ zpool_do_iostat(int argc, char **argv)
                                    "cannot be zero\n"));
                                usage(B_FALSE);
                        }
-
                        /*
                         * Ignore the last parameter
                         */
@@ -2139,7 +2209,7 @@ zpool_do_iostat(int argc, char **argv)
 
        /*
         * If the last argument is also an integer, then we have both a count
-        * and an integer.
+        * and an interval.
         */
        if (argc > 0 && isdigit(argv[argc - 1][0])) {
                char *end;
@@ -2164,6 +2234,66 @@ zpool_do_iostat(int argc, char **argv)
                }
        }
 
+       *iv = interval;
+       *cnt = count;
+       *argcp = argc;
+}
+
+static void
+get_timestamp_arg(char c)
+{
+       if (c == 'u')
+               timestamp_fmt = UDATE;
+       else if (c == 'd')
+               timestamp_fmt = DDATE;
+       else
+               usage(B_FALSE);
+}
+
+/*
+ * zpool iostat [-v] [-T d|u] [pool] ... [interval [count]]
+ *
+ *     -v      Display statistics for individual vdevs
+ *     -T      Display a timestamp in date(1) or Unix format
+ *
+ * This command can be tricky because we want to be able to deal with pool
+ * creation/destruction as well as vdev configuration changes.  The bulk of this
+ * processing is handled by the pool_list_* routines in zpool_iter.c.  We rely
+ * on pool_list_update() to detect the addition of new pools.  Configuration
+ * changes are all handled within libzfs.
+ */
+int
+zpool_do_iostat(int argc, char **argv)
+{
+       int c;
+       int ret;
+       int npools;
+       unsigned long interval = 0, count = 0;
+       zpool_list_t *list;
+       boolean_t verbose = B_FALSE;
+       iostat_cbdata_t cb;
+
+       /* check options */
+       while ((c = getopt(argc, argv, "T:v")) != -1) {
+               switch (c) {
+               case 'T':
+                       get_timestamp_arg(*optarg);
+                       break;
+               case 'v':
+                       verbose = B_TRUE;
+                       break;
+               case '?':
+                       (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+                           optopt);
+                       usage(B_FALSE);
+               }
+       }
+
+       argc -= optind;
+       argv += optind;
+
+       get_interval_count(&argc, argv, &interval, &count);
+
        /*
         * Construct the list of all interesting pools.
         */
@@ -2210,6 +2340,9 @@ zpool_do_iostat(int argc, char **argv)
                cb.cb_namewidth = 0;
                (void) pool_list_iter(list, B_FALSE, get_namewidth, &cb);
 
+               if (timestamp_fmt != NODATE)
+                       print_timestamp(timestamp_fmt);
+
                /*
                 * If it's the first time, or verbose mode, print the header.
                 */
@@ -2361,12 +2494,13 @@ list_callback(zpool_handle_t *zhp, void *data)
 }
 
 /*
- * zpool list [-H] [-o prop[,prop]*] [pool] ...
+ * zpool list [-H] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]]
  *
  *     -H      Scripted mode.  Don't display headers, and separate properties
  *             by a single tab.
  *     -o      List of properties to display.  Defaults to
- *             "name,size,used,available,capacity,health,altroot"
+ *             "name,size,allocated,free,capacity,health,altroot"
+ *     -T      Display a timestamp in date(1) or Unix format
  *
  * List all pools in the system, whether or not they're healthy.  Output space
  * statistics for each one, as well as health status summary.
@@ -2378,11 +2512,12 @@ zpool_do_list(int argc, char **argv)
        int ret;
        list_cbdata_t cb = { 0 };
        static char default_props[] =
-           "name,size,used,available,capacity,health,altroot";
+           "name,size,allocated,free,capacity,dedupratio,health,altroot";
        char *props = default_props;
+       unsigned long interval = 0, count = 0;
 
        /* check options */
-       while ((c = getopt(argc, argv, ":Ho:")) != -1) {
+       while ((c = getopt(argc, argv, ":Ho:T:")) != -1) {
                switch (c) {
                case 'H':
                        cb.cb_scripted = B_TRUE;
@@ -2390,6 +2525,9 @@ zpool_do_list(int argc, char **argv)
                case 'o':
                        props = optarg;
                        break;
+               case 'T':
+                       get_timestamp_arg(*optarg);
+                       break;
                case ':':
                        (void) fprintf(stderr, gettext("missing argument for "
                            "'%c' option\n"), optopt);
@@ -2405,21 +2543,37 @@ zpool_do_list(int argc, char **argv)
        argc -= optind;
        argv += optind;
 
+       get_interval_count(&argc, argv, &interval, &count);
+
        if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0)
                usage(B_FALSE);
 
        cb.cb_first = B_TRUE;
 
-       ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist,
-           list_callback, &cb);
+       for (;;) {
 
-       zprop_free_list(cb.cb_proplist);
+               if (timestamp_fmt != NODATE)
+                       print_timestamp(timestamp_fmt);
 
-       if (argc == 0 && cb.cb_first && !cb.cb_scripted) {
-               (void) printf(gettext("no pools available\n"));
-               return (0);
+               ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist,
+                   list_callback, &cb);
+
+               if (argc == 0 && cb.cb_first && !cb.cb_scripted) {
+                       (void) printf(gettext("no pools available\n"));
+                       zprop_free_list(cb.cb_proplist);
+                       return (0);
+               }
+
+               if (interval == 0)
+                       break;
+
+               if (count != 0 && --count == 0)
+                       break;
+
+               (void) sleep(interval);
        }
 
+       zprop_free_list(cb.cb_proplist);
        return (ret);
 }
 
@@ -2626,6 +2780,146 @@ zpool_do_detach(int argc, char **argv)
 }
 
 /*
+ * zpool split [-n] [-o prop=val] ...
+ *             [-o mntopt] ...
+ *             [-R altroot] <pool> <newpool> [<device> ...]
+ *
+ *     -n      Do not split the pool, but display the resulting layout if
+ *             it were to be split.
+ *     -o      Set property=value, or set mount options.
+ *     -R      Mount the split-off pool under an alternate root.
+ *
+ * Splits the named pool and gives it the new pool name.  Devices to be split
+ * off may be listed, provided that no more than one device is specified
+ * per top-level vdev mirror.  The newly split pool is left in an exported
+ * state unless -R is specified.
+ *
+ * Restrictions: the top-level of the pool must only be made up of
+ * mirrors; all devices in the pool must be healthy; no device may be
+ * undergoing a resilvering operation.
+ */
+int
+zpool_do_split(int argc, char **argv)
+{
+       char *srcpool, *newpool, *propval;
+       char *mntopts = NULL;
+       splitflags_t flags;
+       int c, ret = 0;
+       zpool_handle_t *zhp;
+       nvlist_t *config, *props = NULL;
+
+       flags.dryrun = B_FALSE;
+       flags.import = B_FALSE;
+
+       /* check options */
+       while ((c = getopt(argc, argv, ":R:no:")) != -1) {
+               switch (c) {
+               case 'R':
+                       flags.import = B_TRUE;
+                       if (add_prop_list(
+                           zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg,
+                           &props, B_TRUE) != 0) {
+                               if (props)
+                                       nvlist_free(props);
+                               usage(B_FALSE);
+                       }
+                       break;
+               case 'n':
+                       flags.dryrun = B_TRUE;
+                       break;
+               case 'o':
+                       if ((propval = strchr(optarg, '=')) != NULL) {
+                               *propval = '\0';
+                               propval++;
+                               if (add_prop_list(optarg, propval,
+                                   &props, B_TRUE) != 0) {
+                                       if (props)
+                                               nvlist_free(props);
+                                       usage(B_FALSE);
+                               }
+                       } else {
+                               mntopts = optarg;
+                       }
+                       break;
+               case ':':
+                       (void) fprintf(stderr, gettext("missing argument for "
+                           "'%c' option\n"), optopt);
+                       usage(B_FALSE);
+                       break;
+               case '?':
+                       (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+                           optopt);
+                       usage(B_FALSE);
+                       break;
+               }
+       }
+
+       if (!flags.import && mntopts != NULL) {
+               (void) fprintf(stderr, gettext("setting mntopts is only "
+                   "valid when importing the pool\n"));
+               usage(B_FALSE);
+       }
+
+       argc -= optind;
+       argv += optind;
+
+       if (argc < 1) {
+               (void) fprintf(stderr, gettext("Missing pool name\n"));
+               usage(B_FALSE);
+       }
+       if (argc < 2) {
+               (void) fprintf(stderr, gettext("Missing new pool name\n"));
+               usage(B_FALSE);
+       }
+
+       srcpool = argv[0];
+       newpool = argv[1];
+
+       argc -= 2;
+       argv += 2;
+
+       if ((zhp = zpool_open(g_zfs, srcpool)) == NULL)
+               return (1);
+
+       config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv);
+       if (config == NULL) {
+               ret = 1;
+       } else {
+               if (flags.dryrun) {
+                       (void) printf(gettext("would create '%s' with the "
+                           "following layout:\n\n"), newpool);
+                       print_vdev_tree(NULL, newpool, config, 0, B_FALSE);
+               }
+               nvlist_free(config);
+       }
+
+       zpool_close(zhp);
+
+       if (ret != 0 || flags.dryrun || !flags.import)
+               return (ret);
+
+       /*
+        * The split was successful. Now we need to open the new
+        * pool and import it.
+        */
+       if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL)
+               return (1);
+       if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
+           zpool_enable_datasets(zhp, mntopts, 0) != 0) {
+               ret = 1;
+               (void) fprintf(stderr, gettext("Split was successful, but "
+                   "the datasets could not all be mounted\n"));
+               (void) fprintf(stderr, gettext("Try doing '%s' with a "
+                   "different altroot\n"), "zpool import");
+       }
+       zpool_close(zhp);
+
+       return (ret);
+}
+
+
+
+/*
  * zpool online <pool> <device> ...
  */
 int
@@ -2765,31 +3059,80 @@ zpool_do_offline(int argc, char **argv)
 int
 zpool_do_clear(int argc, char **argv)
 {
+       int c;
        int ret = 0;
+       boolean_t dryrun = B_FALSE;
+       boolean_t do_rewind = B_FALSE;
+       boolean_t xtreme_rewind = B_FALSE;
+       uint32_t rewind_policy = ZPOOL_NO_REWIND;
+       nvlist_t *policy = NULL;
        zpool_handle_t *zhp;
        char *pool, *device;
 
-       if (argc < 2) {
+       /* check options */
+       while ((c = getopt(argc, argv, "FnX")) != -1) {
+               switch (c) {
+               case 'F':
+                       do_rewind = B_TRUE;
+                       break;
+               case 'n':
+                       dryrun = B_TRUE;
+                       break;
+               case 'X':
+                       xtreme_rewind = B_TRUE;
+                       break;
+               case '?':
+                       (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+                           optopt);
+                       usage(B_FALSE);
+               }
+       }
+
+       argc -= optind;
+       argv += optind;
+
+       if (argc < 1) {
                (void) fprintf(stderr, gettext("missing pool name\n"));
                usage(B_FALSE);
        }
 
-       if (argc > 3) {
+       if (argc > 2) {
                (void) fprintf(stderr, gettext("too many arguments\n"));
                usage(B_FALSE);
        }
 
-       pool = argv[1];
-       device = argc == 3 ? argv[2] : NULL;
+       if ((dryrun || xtreme_rewind) && !do_rewind) {
+               (void) fprintf(stderr,
+                   gettext("-n or -X only meaningful with -F\n"));
+               usage(B_FALSE);
+       }
+       if (dryrun)
+               rewind_policy = ZPOOL_TRY_REWIND;
+       else if (do_rewind)
+               rewind_policy = ZPOOL_DO_REWIND;
+       if (xtreme_rewind)
+               rewind_policy |= ZPOOL_EXTREME_REWIND;
 
-       if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL)
+       /* In future, further rewind policy choices can be passed along here */
+       if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 ||
+           nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind_policy) != 0)
                return (1);
 
-       if (zpool_clear(zhp, device) != 0)
+       pool = argv[0];
+       device = argc == 2 ? argv[1] : NULL;
+
+       if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) {
+               nvlist_free(policy);
+               return (1);
+       }
+
+       if (zpool_clear(zhp, device, policy) != 0)
                ret = 1;
 
        zpool_close(zhp);
 
+       nvlist_free(policy);
+
        return (ret);
 }
 
@@ -2814,7 +3157,7 @@ scrub_callback(zpool_handle_t *zhp, void *data)
                return (1);
        }
 
-       err = zpool_scrub(zhp, cb->cb_type);
+       err = zpool_scan(zhp, cb->cb_type);
 
        return (err != 0);
 }
@@ -2830,13 +3173,13 @@ zpool_do_scrub(int argc, char **argv)
        int c;
        scrub_cbdata_t cb;
 
-       cb.cb_type = POOL_SCRUB_EVERYTHING;
+       cb.cb_type = POOL_SCAN_SCRUB;
 
        /* check options */
        while ((c = getopt(argc, argv, "s")) != -1) {
                switch (c) {
                case 's':
-                       cb.cb_type = POOL_SCRUB_NONE;
+                       cb.cb_type = POOL_SCAN_NONE;
                        break;
                case '?':
                        (void) fprintf(stderr, gettext("invalid option '%c'\n"),
@@ -2864,68 +3207,110 @@ typedef struct status_cbdata {
        boolean_t       cb_verbose;
        boolean_t       cb_explain;
        boolean_t       cb_first;
+       boolean_t       cb_dedup_stats;
 } status_cbdata_t;
 
 /*
  * Print out detailed scrub status.
  */
 void
-print_scrub_status(nvlist_t *nvroot)
+print_scan_status(pool_scan_stat_t *ps)
 {
-       vdev_stat_t *vs;
-       uint_t vsc;
-       time_t start, end, now;
+       time_t start, end;
+       uint64_t elapsed, mins_left;
+       uint64_t pass_exam, examined, total;
+       uint_t rate;
        double fraction_done;
-       uint64_t examined, total, minutes_left, minutes_taken;
-       char *scrub_type;
+       char processed_buf[7], examined_buf[7], total_buf[7], rate_buf[7];
 
-       verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
-           (uint64_t **)&vs, &vsc) == 0);
+       (void) printf(gettext(" scan: "));
 
-       /*
-        * If there's never been a scrub, there's not much to say.
-        */
-       if (vs->vs_scrub_end == 0 && vs->vs_scrub_type == POOL_SCRUB_NONE) {
+       /* If there's never been a scan, there's not much to say. */
+       if (ps == NULL || ps->pss_func == POOL_SCAN_NONE ||
+           ps->pss_func >= POOL_SCAN_FUNCS) {
                (void) printf(gettext("none requested\n"));
                return;
        }
 
-       scrub_type = (vs->vs_scrub_type == POOL_SCRUB_RESILVER) ?
-           "resilver" : "scrub";
-
-       start = vs->vs_scrub_start;
-       end = vs->vs_scrub_end;
-       now = time(NULL);
-       examined = vs->vs_scrub_examined;
-       total = vs->vs_alloc;
+       start = ps->pss_start_time;
+       end = ps->pss_end_time;
+       zfs_nicenum(ps->pss_processed, processed_buf, sizeof (processed_buf));
 
-       if (end != 0) {
-               minutes_taken = (uint64_t)((end - start) / 60);
-
-               (void) printf(gettext("%s %s after %lluh%um with %llu errors "
-                   "on %s"),
-                   scrub_type, vs->vs_scrub_complete ? "completed" : "stopped",
+       assert(ps->pss_func == POOL_SCAN_SCRUB ||
+           ps->pss_func == POOL_SCAN_RESILVER);
+       /*
+        * Scan is finished or canceled.
+        */
+       if (ps->pss_state == DSS_FINISHED) {
+               uint64_t minutes_taken = (end - start) / 60;
+               char *fmt;
+
+               if (ps->pss_func == POOL_SCAN_SCRUB) {
+                       fmt = gettext("scrub repaired %s in %lluh%um with "
+                           "%llu errors on %s");
+               } else if (ps->pss_func == POOL_SCAN_RESILVER) {
+                       fmt = gettext("resilvered %s in %lluh%um with "
+                           "%llu errors on %s");
+               }
+               /* LINTED */
+               (void) printf(fmt, processed_buf,
                    (u_longlong_t)(minutes_taken / 60),
                    (uint_t)(minutes_taken % 60),
-                   (u_longlong_t)vs->vs_scrub_errors, ctime(&end));
+                   (u_longlong_t)ps->pss_errors,
+                   ctime((time_t *)&end));
+               return;
+       } else if (ps->pss_state == DSS_CANCELED) {
+               if (ps->pss_func == POOL_SCAN_SCRUB) {
+                       (void) printf(gettext("scrub canceled on %s"),
+                           ctime(&end));
+               } else if (ps->pss_func == POOL_SCAN_RESILVER) {
+                       (void) printf(gettext("resilver canceled on %s"),
+                           ctime(&end));
+               }
                return;
        }
 
-       if (examined == 0)
-               examined = 1;
-       if (examined > total)
-               total = examined;
+       assert(ps->pss_state == DSS_SCANNING);
+
+       /*
+        * Scan is in progress.
+        */
+       if (ps->pss_func == POOL_SCAN_SCRUB) {
+               (void) printf(gettext("scrub in progress since %s"),
+                   ctime(&start));
+       } else if (ps->pss_func == POOL_SCAN_RESILVER) {
+               (void) printf(gettext("resilver in progress since %s"),
+                   ctime(&start));
+       }
 
+       examined = ps->pss_examined ? ps->pss_examined : 1;
+       total = ps->pss_to_examine;
        fraction_done = (double)examined / total;
-       minutes_left = (uint64_t)((now - start) *
-           (1 - fraction_done) / fraction_done / 60);
-       minutes_taken = (uint64_t)((now - start) / 60);
 
-       (void) printf(gettext("%s in progress for %lluh%um, %.2f%% done, "
-           "%lluh%um to go\n"),
-           scrub_type, (u_longlong_t)(minutes_taken / 60),
-           (uint_t)(minutes_taken % 60), 100 * fraction_done,
-           (u_longlong_t)(minutes_left / 60), (uint_t)(minutes_left % 60));
+       /* elapsed time for this pass */
+       elapsed = time(NULL) - ps->pss_pass_start;
+       elapsed = elapsed ? elapsed : 1;
+       pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1;
+       rate = pass_exam / elapsed;
+       rate = rate ? rate : 1;
+       mins_left = ((total - examined) / rate) / 60;
+
+       zfs_nicenum(examined, examined_buf, sizeof (examined_buf));
+       zfs_nicenum(total, total_buf, sizeof (total_buf));
+       zfs_nicenum(rate, rate_buf, sizeof (rate_buf));
+
+       (void) printf(gettext("    %s scanned out of %s at "
+           "%s/s, %lluh%um to go\n"), examined_buf, total_buf, rate_buf,
+           (u_longlong_t)(mins_left / 60),
+           (uint_t)(mins_left % 60));
+
+       if (ps->pss_func == POOL_SCAN_RESILVER) {
+               (void) printf(gettext("    %s resilvered, %.2f%% done\n"),
+                   processed_buf, 100 * fraction_done);
+       } else if (ps->pss_func == POOL_SCAN_SCRUB) {
+               (void) printf(gettext("    %s repaired, %.2f%% done\n"),
+                   processed_buf, 100 * fraction_done);
+       }
 }
 
 static void
@@ -2976,7 +3361,7 @@ print_spares(zpool_handle_t *zhp, nvlist_t **spares, uint_t nspares,
        (void) printf(gettext("\tspares\n"));
 
        for (i = 0; i < nspares; i++) {
-               name = zpool_vdev_name(g_zfs, zhp, spares[i]);
+               name = zpool_vdev_name(g_zfs, zhp, spares[i], B_FALSE);
                print_status_config(zhp, name, spares[i],
                    namewidth, 2, B_TRUE);
                free(name);
@@ -2996,13 +3381,43 @@ print_l2cache(zpool_handle_t *zhp, nvlist_t **l2cache, uint_t nl2cache,
        (void) printf(gettext("\tcache\n"));
 
        for (i = 0; i < nl2cache; i++) {
-               name = zpool_vdev_name(g_zfs, zhp, l2cache[i]);
+               name = zpool_vdev_name(g_zfs, zhp, l2cache[i], B_FALSE);
                print_status_config(zhp, name, l2cache[i],
                    namewidth, 2, B_FALSE);
                free(name);
        }
 }
 
+static void
+print_dedup_stats(nvlist_t *config)
+{
+       ddt_histogram_t *ddh;
+       ddt_stat_t *dds;
+       ddt_object_t *ddo;
+       uint_t c;
+
+       /*
+        * If the pool was faulted then we may not have been able to
+        * obtain the config. Otherwise, if we have anything in the dedup
+        * table, continue processing the stats.
+        */
+       if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS,
+           (uint64_t **)&ddo, &c) != 0 || ddo->ddo_count == 0)
+               return;
+
+       (void) printf("\n");
+       (void) printf("DDT entries %llu, size %llu on disk, %llu in core\n",
+           (u_longlong_t)ddo->ddo_count,
+           (u_longlong_t)ddo->ddo_dspace,
+           (u_longlong_t)ddo->ddo_mspace);
+
+       verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS,
+           (uint64_t **)&dds, &c) == 0);
+       verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM,
+           (uint64_t **)&ddh, &c) == 0);
+       zpool_dump_ddt(dds, ddh);
+}
+
 /*
  * Display a summary of pool status.  Displays a summary such as:
  *
@@ -3055,7 +3470,7 @@ status_callback(zpool_handle_t *zhp, void *data)
 
        verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
            &nvroot) == 0);
-       verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
+       verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
            (uint64_t **)&vs, &c) == 0);
        health = zpool_state_to_name(vs->vs_state, vs->vs_aux);
 
@@ -3093,8 +3508,8 @@ status_callback(zpool_handle_t *zhp, void *data)
                    "be used because the label is missing \n\tor invalid.  "
                    "There are insufficient replicas for the pool to "
                    "continue\n\tfunctioning.\n"));
-               (void) printf(gettext("action: Destroy and re-create the pool "
-                   "from a backup source.\n"));
+               zpool_explain_recover(zpool_get_handle(zhp),
+                   zpool_get_name(zhp), reason, config);
                break;
 
        case ZPOOL_STATUS_FAILING_DEV:
@@ -3128,7 +3543,6 @@ status_callback(zpool_handle_t *zhp, void *data)
                    "replace'.\n"));
                break;
 
-
        case ZPOOL_STATUS_RESILVERING:
                (void) printf(gettext("status: One or more devices is "
                    "currently being resilvered.  The pool will\n\tcontinue "
@@ -3149,8 +3563,8 @@ status_callback(zpool_handle_t *zhp, void *data)
        case ZPOOL_STATUS_CORRUPT_POOL:
                (void) printf(gettext("status: The pool metadata is corrupted "
                    "and the pool cannot be opened.\n"));
-               (void) printf(gettext("action: Destroy and re-create the pool "
-                   "from a backup source.\n"));
+               zpool_explain_recover(zpool_get_handle(zhp),
+                   zpool_get_name(zhp), reason, config);
                break;
 
        case ZPOOL_STATUS_VERSION_OLDER:
@@ -3226,10 +3640,11 @@ status_callback(zpool_handle_t *zhp, void *data)
                uint64_t nerr;
                nvlist_t **spares, **l2cache;
                uint_t nspares, nl2cache;
+               pool_scan_stat_t *ps = NULL;
 
-
-               (void) printf(gettext(" scrub: "));
-               print_scrub_status(nvroot);
+               (void) nvlist_lookup_uint64_array(nvroot,
+                   ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c);
+               print_scan_status(ps);
 
                namewidth = max_width(zhp, nvroot, 0, 0);
                if (namewidth < 10)
@@ -3285,6 +3700,9 @@ status_callback(zpool_handle_t *zhp, void *data)
                        else
                                print_error_log(zhp);
                }
+
+               if (cbp->cb_dedup_stats)
+                       print_dedup_stats(config);
        } else {
                (void) printf(gettext("config: The configuration cannot be "
                    "determined.\n"));
@@ -3294,10 +3712,12 @@ status_callback(zpool_handle_t *zhp, void *data)
 }
 
 /*
- * zpool status [-vx] [pool] ...
+ * zpool status [-vx] [-T d|u] [pool] ... [interval [count]]
  *
  *     -v      Display complete error logs
  *     -x      Display only pools with potential problems
+ *     -D      Display dedup status (undocumented)
+ *     -T      Display a timestamp in date(1) or Unix format
  *
  * Describes the health status of all pools or some subset.
  */
@@ -3306,10 +3726,11 @@ zpool_do_status(int argc, char **argv)
 {
        int c;
        int ret;
+       unsigned long interval = 0, count = 0;
        status_cbdata_t cb = { 0 };
 
        /* check options */
-       while ((c = getopt(argc, argv, "vx")) != -1) {
+       while ((c = getopt(argc, argv, "vxDT:")) != -1) {
                switch (c) {
                case 'v':
                        cb.cb_verbose = B_TRUE;
@@ -3317,6 +3738,12 @@ zpool_do_status(int argc, char **argv)
                case 'x':
                        cb.cb_explain = B_TRUE;
                        break;
+               case 'D':
+                       cb.cb_dedup_stats = B_TRUE;
+                       break;
+               case 'T':
+                       get_timestamp_arg(*optarg);
+                       break;
                case '?':
                        (void) fprintf(stderr, gettext("invalid option '%c'\n"),
                            optopt);
@@ -3327,19 +3754,38 @@ zpool_do_status(int argc, char **argv)
        argc -= optind;
        argv += optind;
 
-       cb.cb_first = B_TRUE;
+       get_interval_count(&argc, argv, &interval, &count);
 
        if (argc == 0)
                cb.cb_allpools = B_TRUE;
 
-       ret = for_each_pool(argc, argv, B_TRUE, NULL, status_callback, &cb);
+       cb.cb_first = B_TRUE;
 
-       if (argc == 0 && cb.cb_count == 0)
-               (void) printf(gettext("no pools available\n"));
-       else if (cb.cb_explain && cb.cb_first && cb.cb_allpools)
-               (void) printf(gettext("all pools are healthy\n"));
+       for (;;) {
+               if (timestamp_fmt != NODATE)
+                       print_timestamp(timestamp_fmt);
 
-       return (ret);
+               ret = for_each_pool(argc, argv, B_TRUE, NULL,
+                   status_callback, &cb);
+
+               if (argc == 0 && cb.cb_count == 0)
+                       (void) printf(gettext("no pools available\n"));
+               else if (cb.cb_explain && cb.cb_first && cb.cb_allpools)
+                       (void) printf(gettext("all pools are healthy\n"));
+
+               if (ret != 0)
+                       return (ret);
+
+               if (interval == 0)
+                       break;
+
+               if (count != 0 && --count == 0)
+                       break;
+
+               (void) sleep(interval);
+       }
+
+       return (0);
 }
 
 typedef struct upgrade_cbdata {
@@ -3552,12 +3998,20 @@ zpool_do_upgrade(int argc, char **argv)
                (void) printf(gettext(" 15  user/group space accounting\n"));
                (void) printf(gettext(" 16  stmf property support\n"));
                (void) printf(gettext(" 17  Triple-parity RAID-Z\n"));
-               (void) printf(gettext(" 18  snapshot user holds\n"));
-               (void) printf(gettext("For more information on a particular "
-                   "version, including supported releases, see:\n\n"));
-               (void) printf("http://www.opensolaris.org/os/community/zfs/"
-                   "version/N\n\n");
-               (void) printf(gettext("Where 'N' is the version number.\n"));
+               (void) printf(gettext(" 18  Snapshot user holds\n"));
+               (void) printf(gettext(" 19  Log device removal\n"));
+               (void) printf(gettext(" 20  Compression using zle "
+                   "(zero-length encoding)\n"));
+               (void) printf(gettext(" 21  Deduplication\n"));
+               (void) printf(gettext(" 22  Received properties\n"));
+               (void) printf(gettext(" 23  Slim ZIL\n"));
+               (void) printf(gettext(" 24  System attributes\n"));
+               (void) printf(gettext(" 25  Improved scrub stats\n"));
+               (void) printf(gettext(" 26  Improved snapshot deletion "
+                   "performance\n"));
+               (void) printf(gettext("\nFor more information on a particular "
+                   "version, including supported releases,\n"));
+               (void) printf(gettext("see the ZFS Administration Guide.\n\n"));
        } else if (argc == 0) {
                int notfound;
 
@@ -3599,49 +4053,6 @@ typedef struct hist_cbdata {
        int internal;
 } hist_cbdata_t;
 
-char *hist_event_table[LOG_END] = {
-       "invalid event",
-       "pool create",
-       "vdev add",
-       "pool remove",
-       "pool destroy",
-       "pool export",
-       "pool import",
-       "vdev attach",
-       "vdev replace",
-       "vdev detach",
-       "vdev online",
-       "vdev offline",
-       "vdev upgrade",
-       "pool clear",
-       "pool scrub",
-       "pool property set",
-       "create",
-       "clone",
-       "destroy",
-       "destroy_begin_sync",
-       "inherit",
-       "property set",
-       "quota set",
-       "permission update",
-       "permission remove",
-       "permission who remove",
-       "promote",
-       "receive",
-       "rename",
-       "reservation set",
-       "replay_inc_sync",
-       "replay_full_sync",
-       "rollback",
-       "snapshot",
-       "filesystem version upgrade",
-       "refquota set",
-       "refreservation set",
-       "pool scrub done",
-       "user hold",
-       "user release",
-};
-
 /*
  * Print out the command history for a specific pool.
  */
@@ -3699,7 +4110,7 @@ get_history_one(zpool_handle_t *zhp, void *data)
                        (void) snprintf(internalstr,
                            sizeof (internalstr),
                            "[internal %s txg:%lld] %s",
-                           hist_event_table[ievent], txg,
+                           zfs_history_event_names[ievent], txg,
                            pathstr);
                        cmdstr = internalstr;
                }
@@ -3811,7 +4222,8 @@ get_callback(zpool_handle_t *zhp, void *data)
                        continue;
 
                zprop_print_one_property(zpool_get_name(zhp), cbp,
-                   zpool_prop_to_name(pl->pl_prop), value, srctype, NULL);
+                   zpool_prop_to_name(pl->pl_prop), value, srctype, NULL,
+                   NULL);
        }
        return (0);
 }
index f44da4f..c7a002e 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <errno.h>
 #include <libgen.h>
 #include <libintl.h>
@@ -51,22 +49,6 @@ safe_malloc(size_t size)
 }
 
 /*
- * Same as above, but for strdup()
- */
-char *
-safe_strdup(const char *str)
-{
-       char *ret;
-
-       if ((ret = strdup(str)) == NULL) {
-               (void) fprintf(stderr, "internal error: out of memory\n");
-               exit(1);
-       }
-
-       return (ret);
-}
-
-/*
  * Display an out of memory error message and abort the current program.
  */
 void
index e82f320..134c730 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        ZPOOL_UTIL_H
@@ -37,7 +36,6 @@ extern "C" {
  * Basic utility functions
  */
 void *safe_malloc(size_t);
-char *safe_strdup(const char *);
 void zpool_no_memory(void);
 uint_t num_logs(nvlist_t *nv);
 
@@ -46,7 +44,9 @@ uint_t num_logs(nvlist_t *nv);
  */
 
 nvlist_t *make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
-    boolean_t isreplace, boolean_t dryrun, int argc, char **argv);
+    boolean_t replacing, boolean_t dryrun, int argc, char **argv);
+nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname,
+    nvlist_t *props, splitflags_t flags, int argc, char **argv);
 
 /*
  * Pool list functions
index 6215191..53c2e60 100644 (file)
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
@@ -1004,8 +1003,8 @@ is_spare(nvlist_t *config, const char *path)
                return (B_FALSE);
        }
        free(name);
-
        (void) close(fd);
+
        verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
        nvlist_free(label);
 
@@ -1029,8 +1028,8 @@ is_spare(nvlist_t *config, const char *path)
  * the majority of this task.
  */
 static int
-check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing,
-    int isspare)
+check_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
+    boolean_t replacing, boolean_t isspare)
 {
        nvlist_t **child;
        uint_t c, children;
@@ -1051,13 +1050,14 @@ check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing,
                 * hot spare within the same pool.  If so, we allow it
                 * regardless of what libdiskmgt or zpool_in_use() says.
                 */
-               if (isreplacing) {
+               if (replacing) {
                        if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
                            &wholedisk) == 0 && wholedisk)
                                (void) snprintf(buf, sizeof (buf), "%ss0",
                                    path);
                        else
                                (void) strlcpy(buf, path, sizeof (buf));
+
                        if (is_spare(config, buf))
                                return (0);
                }
@@ -1073,21 +1073,21 @@ check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing,
 
        for (c = 0; c < children; c++)
                if ((ret = check_in_use(config, child[c], force,
-                   isreplacing, B_FALSE)) != 0)
+                   replacing, B_FALSE)) != 0)
                        return (ret);
 
        if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
            &child, &children) == 0)
                for (c = 0; c < children; c++)
                        if ((ret = check_in_use(config, child[c], force,
-                           isreplacing, B_TRUE)) != 0)
+                           replacing, B_TRUE)) != 0)
                                return (ret);
 
        if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
            &child, &children) == 0)
                for (c = 0; c < children; c++)
                        if ((ret = check_in_use(config, child[c], force,
-                           isreplacing, B_FALSE)) != 0)
+                           replacing, B_FALSE)) != 0)
                                return (ret);
 
        return (0);
@@ -1360,6 +1360,52 @@ construct_spec(int argc, char **argv)
        return (nvroot);
 }
 
+nvlist_t *
+split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
+    splitflags_t flags, int argc, char **argv)
+{
+       nvlist_t *newroot = NULL, **child;
+       uint_t c, children;
+
+       if (argc > 0) {
+               if ((newroot = construct_spec(argc, argv)) == NULL) {
+                       (void) fprintf(stderr, gettext("Unable to build a "
+                           "pool from the specified devices\n"));
+                       return (NULL);
+               }
+
+               if (!flags.dryrun && make_disks(zhp, newroot) != 0) {
+                       nvlist_free(newroot);
+                       return (NULL);
+               }
+
+               /* avoid any tricks in the spec */
+               verify(nvlist_lookup_nvlist_array(newroot,
+                   ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
+               for (c = 0; c < children; c++) {
+                       char *path;
+                       const char *type;
+                       int min, max;
+
+                       verify(nvlist_lookup_string(child[c],
+                           ZPOOL_CONFIG_PATH, &path) == 0);
+                       if ((type = is_grouping(path, &min, &max)) != NULL) {
+                               (void) fprintf(stderr, gettext("Cannot use "
+                                   "'%s' as a device for splitting\n"), type);
+                               nvlist_free(newroot);
+                               return (NULL);
+                       }
+               }
+       }
+
+       if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
+               if (newroot != NULL)
+                       nvlist_free(newroot);
+               return (NULL);
+       }
+
+       return (newroot);
+}
 
 /*
  * Get and validate the contents of the given vdev specification.  This ensures
@@ -1373,7 +1419,7 @@ construct_spec(int argc, char **argv)
  */
 nvlist_t *
 make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
-    boolean_t isreplacing, boolean_t dryrun, int argc, char **argv)
+    boolean_t replacing, boolean_t dryrun, int argc, char **argv)
 {
        nvlist_t *newroot;
        nvlist_t *poolconfig = NULL;
@@ -1396,8 +1442,7 @@ make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
         * uses (such as a dedicated dump device) that even '-f' cannot
         * override.
         */
-       if (check_in_use(poolconfig, newroot, force, isreplacing,
-           B_FALSE) != 0) {
+       if (check_in_use(poolconfig, newroot, force, replacing, B_FALSE) != 0) {
                nvlist_free(newroot);
                return (NULL);
        }
index 5f49fd5..eed92ec 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
 #include <sys/mman.h>
 #include <sys/resource.h>
 #include <sys/zio.h>
-#include <sys/zio_checksum.h>
-#include <sys/zio_compress.h>
 #include <sys/zil.h>
+#include <sys/zil_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_file.h>
 #include <sys/spa_impl.h>
+#include <sys/metaslab_impl.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dataset.h>
+#include <sys/dsl_scan.h>
+#include <sys/zio_checksum.h>
 #include <sys/refcount.h>
 #include <stdio.h>
 #include <stdio_ext.h>
 #include <ctype.h>
 #include <math.h>
 #include <sys/fs/zfs.h>
+#include <libnvpair.h>
 
 static char cmdname[] = "ztest";
 static char *zopt_pool = cmdname;
@@ -124,142 +126,231 @@ static int zopt_verbose = 0;
 static int zopt_init = 1;
 static char *zopt_dir = "/tmp";
 static uint64_t zopt_time = 300;       /* 5 minutes */
-static int zopt_maxfaults;
+static uint64_t zopt_maxloops = 50;    /* max loops during spa_freeze() */
+
+#define        BT_MAGIC        0x123456789abcdefULL
+#define        MAXFAULTS() (MAX(zs->zs_mirrors, 1) * (zopt_raidz_parity + 1) - 1)
+
+enum ztest_io_type {
+       ZTEST_IO_WRITE_TAG,
+       ZTEST_IO_WRITE_PATTERN,
+       ZTEST_IO_WRITE_ZEROES,
+       ZTEST_IO_TRUNCATE,
+       ZTEST_IO_SETATTR,
+       ZTEST_IO_TYPES
+};
 
 typedef struct ztest_block_tag {
+       uint64_t        bt_magic;
        uint64_t        bt_objset;
        uint64_t        bt_object;
        uint64_t        bt_offset;
+       uint64_t        bt_gen;
        uint64_t        bt_txg;
-       uint64_t        bt_thread;
-       uint64_t        bt_seq;
+       uint64_t        bt_crtxg;
 } ztest_block_tag_t;
 
-typedef struct ztest_args {
-       char            za_pool[MAXNAMELEN];
-       spa_t           *za_spa;
-       objset_t        *za_os;
-       zilog_t         *za_zilog;
-       thread_t        za_thread;
-       uint64_t        za_instance;
-       uint64_t        za_random;
-       uint64_t        za_diroff;
-       uint64_t        za_diroff_shared;
-       uint64_t        za_zil_seq;
-       hrtime_t        za_start;
-       hrtime_t        za_stop;
-       hrtime_t        za_kill;
-       /*
-        * Thread-local variables can go here to aid debugging.
-        */
-       ztest_block_tag_t za_rbt;
-       ztest_block_tag_t za_wbt;
-       dmu_object_info_t za_doi;
-       dmu_buf_t       *za_dbuf;
-} ztest_args_t;
+typedef struct bufwad {
+       uint64_t        bw_index;
+       uint64_t        bw_txg;
+       uint64_t        bw_data;
+} bufwad_t;
+
+/*
+ * XXX -- fix zfs range locks to be generic so we can use them here.
+ */
+typedef enum {
+       RL_READER,
+       RL_WRITER,
+       RL_APPEND
+} rl_type_t;
+
+typedef struct rll {
+       void            *rll_writer;
+       int             rll_readers;
+       mutex_t         rll_lock;
+       cond_t          rll_cv;
+} rll_t;
+
+typedef struct rl {
+       uint64_t        rl_object;
+       uint64_t        rl_offset;
+       uint64_t        rl_size;
+       rll_t           *rl_lock;
+} rl_t;
+
+#define        ZTEST_RANGE_LOCKS       64
+#define        ZTEST_OBJECT_LOCKS      64
+
+/*
+ * Object descriptor.  Used as a template for object lookup/create/remove.
+ */
+typedef struct ztest_od {
+       uint64_t        od_dir;
+       uint64_t        od_object;
+       dmu_object_type_t od_type;
+       dmu_object_type_t od_crtype;
+       uint64_t        od_blocksize;
+       uint64_t        od_crblocksize;
+       uint64_t        od_gen;
+       uint64_t        od_crgen;
+       char            od_name[MAXNAMELEN];
+} ztest_od_t;
 
-typedef void ztest_func_t(ztest_args_t *);
+/*
+ * Per-dataset state.
+ */
+typedef struct ztest_ds {
+       objset_t        *zd_os;
+       zilog_t         *zd_zilog;
+       uint64_t        zd_seq;
+       ztest_od_t      *zd_od;         /* debugging aid */
+       char            zd_name[MAXNAMELEN];
+       mutex_t         zd_dirobj_lock;
+       rll_t           zd_object_lock[ZTEST_OBJECT_LOCKS];
+       rll_t           zd_range_lock[ZTEST_RANGE_LOCKS];
+} ztest_ds_t;
+
+/*
+ * Per-iteration state.
+ */
+typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);
+
+typedef struct ztest_info {
+       ztest_func_t    *zi_func;       /* test function */
+       uint64_t        zi_iters;       /* iterations per execution */
+       uint64_t        *zi_interval;   /* execute every <interval> seconds */
+       uint64_t        zi_call_count;  /* per-pass count */
+       uint64_t        zi_call_time;   /* per-pass time */
+       uint64_t        zi_call_next;   /* next time to call this function */
+} ztest_info_t;
 
 /*
  * Note: these aren't static because we want dladdr() to work.
  */
 ztest_func_t ztest_dmu_read_write;
-ztest_func_t ztest_dmu_read_write_zcopy;
 ztest_func_t ztest_dmu_write_parallel;
 ztest_func_t ztest_dmu_object_alloc_free;
+ztest_func_t ztest_dmu_commit_callbacks;
 ztest_func_t ztest_zap;
 ztest_func_t ztest_zap_parallel;
-ztest_func_t ztest_traverse;
-ztest_func_t ztest_dsl_prop_get_set;
+ztest_func_t ztest_zil_commit;
+ztest_func_t ztest_dmu_read_write_zcopy;
 ztest_func_t ztest_dmu_objset_create_destroy;
+ztest_func_t ztest_dmu_prealloc;
+ztest_func_t ztest_fzap;
 ztest_func_t ztest_dmu_snapshot_create_destroy;
-ztest_func_t ztest_dsl_dataset_promote_busy;
+ztest_func_t ztest_dsl_prop_get_set;
+ztest_func_t ztest_spa_prop_get_set;
 ztest_func_t ztest_spa_create_destroy;
 ztest_func_t ztest_fault_inject;
+ztest_func_t ztest_ddt_repair;
+ztest_func_t ztest_dmu_snapshot_hold;
 ztest_func_t ztest_spa_rename;
+ztest_func_t ztest_scrub;
+ztest_func_t ztest_dsl_dataset_promote_busy;
 ztest_func_t ztest_vdev_attach_detach;
 ztest_func_t ztest_vdev_LUN_growth;
 ztest_func_t ztest_vdev_add_remove;
 ztest_func_t ztest_vdev_aux_add_remove;
-ztest_func_t ztest_scrub;
-
-typedef struct ztest_info {
-       ztest_func_t    *zi_func;       /* test function */
-       uint64_t        zi_iters;       /* iterations per execution */
-       uint64_t        *zi_interval;   /* execute every <interval> seconds */
-       uint64_t        zi_calls;       /* per-pass count */
-       uint64_t        zi_call_time;   /* per-pass time */
-       uint64_t        zi_call_total;  /* cumulative total */
-       uint64_t        zi_call_target; /* target cumulative total */
-} ztest_info_t;
+ztest_func_t ztest_split_pool;
 
-uint64_t zopt_always = 0;              /* all the time */
-uint64_t zopt_often = 1;               /* every second */
-uint64_t zopt_sometimes = 10;          /* every 10 seconds */
-uint64_t zopt_rarely = 60;             /* every 60 seconds */
+uint64_t zopt_always = 0ULL * NANOSEC;         /* all the time */
+uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */
+uint64_t zopt_often = 1ULL * NANOSEC;          /* every second */
+uint64_t zopt_sometimes = 10ULL * NANOSEC;     /* every 10 seconds */
+uint64_t zopt_rarely = 60ULL * NANOSEC;                /* every 60 seconds */
 
 ztest_info_t ztest_info[] = {
        { ztest_dmu_read_write,                 1,      &zopt_always    },
-       { ztest_dmu_read_write_zcopy,           1,      &zopt_always    },
-       { ztest_dmu_write_parallel,             30,     &zopt_always    },
+       { ztest_dmu_write_parallel,             10,     &zopt_always    },
        { ztest_dmu_object_alloc_free,          1,      &zopt_always    },
+       { ztest_dmu_commit_callbacks,           1,      &zopt_always    },
        { ztest_zap,                            30,     &zopt_always    },
        { ztest_zap_parallel,                   100,    &zopt_always    },
-       { ztest_dsl_prop_get_set,               1,      &zopt_sometimes },
-       { ztest_dmu_objset_create_destroy,      1,      &zopt_sometimes },
-       { ztest_dmu_snapshot_create_destroy,    1,      &zopt_sometimes },
-       { ztest_spa_create_destroy,             1,      &zopt_sometimes },
+       { ztest_split_pool,                     1,      &zopt_always    },
+       { ztest_zil_commit,                     1,      &zopt_incessant },
+       { ztest_dmu_read_write_zcopy,           1,      &zopt_often     },
+       { ztest_dmu_objset_create_destroy,      1,      &zopt_often     },
+       { ztest_dsl_prop_get_set,               1,      &zopt_often     },
+       { ztest_spa_prop_get_set,               1,      &zopt_sometimes },
+#if 0
+       { ztest_dmu_prealloc,                   1,      &zopt_sometimes },
+#endif
+       { ztest_fzap,                           1,      &zopt_sometimes },
+       { ztest_dmu_snapshot_create_destroy,    1,      &zopt_sometimes },
+       { ztest_spa_create_destroy,             1,      &zopt_sometimes },
        { ztest_fault_inject,                   1,      &zopt_sometimes },
+       { ztest_ddt_repair,                     1,      &zopt_sometimes },
+       { ztest_dmu_snapshot_hold,              1,      &zopt_sometimes },
        { ztest_spa_rename,                     1,      &zopt_rarely    },
-       { ztest_vdev_attach_detach,             1,      &zopt_rarely    },
-       { ztest_vdev_LUN_growth,                1,      &zopt_rarely    },
+       { ztest_scrub,                          1,      &zopt_rarely    },
        { ztest_dsl_dataset_promote_busy,       1,      &zopt_rarely    },
-       { ztest_vdev_add_remove,                1,      &zopt_vdevtime  },
+       { ztest_vdev_attach_detach,             1,      &zopt_rarely },
+       { ztest_vdev_LUN_growth,                1,      &zopt_rarely    },
+       { ztest_vdev_add_remove,                1,      &zopt_vdevtime },
        { ztest_vdev_aux_add_remove,            1,      &zopt_vdevtime  },
-       { ztest_scrub,                          1,      &zopt_vdevtime  },
 };
 
 #define        ZTEST_FUNCS     (sizeof (ztest_info) / sizeof (ztest_info_t))
 
-#define        ZTEST_SYNC_LOCKS        16
+/*
+ * The following struct is used to hold a list of uncalled commit callbacks.
+ * The callbacks are ordered by txg number.
+ */
+typedef struct ztest_cb_list {
+       mutex_t zcl_callbacks_lock;
+       list_t  zcl_callbacks;
+} ztest_cb_list_t;
 
 /*
  * Stuff we need to share writably between parent and child.
  */
 typedef struct ztest_shared {
-       mutex_t         zs_vdev_lock;
-       rwlock_t        zs_name_lock;
-       uint64_t        zs_vdev_primaries;
-       uint64_t        zs_vdev_aux;
+       char            *zs_pool;
+       spa_t           *zs_spa;
+       hrtime_t        zs_proc_start;
+       hrtime_t        zs_proc_stop;
+       hrtime_t        zs_thread_start;
+       hrtime_t        zs_thread_stop;
+       hrtime_t        zs_thread_kill;
        uint64_t        zs_enospc_count;
-       hrtime_t        zs_start_time;
-       hrtime_t        zs_stop_time;
+       uint64_t        zs_vdev_next_leaf;
+       uint64_t        zs_vdev_aux;
        uint64_t        zs_alloc;
        uint64_t        zs_space;
+       mutex_t         zs_vdev_lock;
+       rwlock_t        zs_name_lock;
        ztest_info_t    zs_info[ZTEST_FUNCS];
-       mutex_t         zs_sync_lock[ZTEST_SYNC_LOCKS];
-       uint64_t        zs_seq[ZTEST_SYNC_LOCKS];
+       uint64_t        zs_splits;
+       uint64_t        zs_mirrors;
+       ztest_ds_t      zs_zd[];
 } ztest_shared_t;
 
+#define        ID_PARALLEL     -1ULL
+
 static char ztest_dev_template[] = "%s/%s.%llua";
 static char ztest_aux_template[] = "%s/%s.%s.%llu";
-static ztest_shared_t *ztest_shared;
+ztest_shared_t *ztest_shared;
+uint64_t *ztest_seq;
 
 static int ztest_random_fd;
 static int ztest_dump_core = 1;
 
-static uint64_t metaslab_sz;
 static boolean_t ztest_exiting;
 
+/* Global commit callback list */
+static ztest_cb_list_t zcl;
+
 extern uint64_t metaslab_gang_bang;
 extern uint64_t metaslab_df_alloc_threshold;
+static uint64_t metaslab_sz;
 
-#define        ZTEST_DIROBJ            1
-#define        ZTEST_MICROZAP_OBJ      2
-#define        ZTEST_FATZAP_OBJ        3
-
-#define        ZTEST_DIROBJ_BLOCKSIZE  (1 << 10)
-#define        ZTEST_DIRSIZE           256
+enum ztest_object {
+       ZTEST_META_DNODE = 0,
+       ZTEST_DIROBJ,
+       ZTEST_OBJECTS
+};
 
 static void usage(boolean_t) __NORETURN;
 
@@ -377,21 +468,22 @@ usage(boolean_t requested)
        (void) fprintf(fp, "Usage: %s\n"
            "\t[-v vdevs (default: %llu)]\n"
            "\t[-s size_of_each_vdev (default: %s)]\n"
-           "\t[-a alignment_shift (default: %d) (use 0 for random)]\n"
+           "\t[-a alignment_shift (default: %d)] use 0 for random\n"
            "\t[-m mirror_copies (default: %d)]\n"
            "\t[-r raidz_disks (default: %d)]\n"
            "\t[-R raidz_parity (default: %d)]\n"
            "\t[-d datasets (default: %d)]\n"
            "\t[-t threads (default: %d)]\n"
            "\t[-g gang_block_threshold (default: %s)]\n"
-           "\t[-i initialize pool i times (default: %d)]\n"
-           "\t[-k kill percentage (default: %llu%%)]\n"
+           "\t[-i init_count (default: %d)] initialize pool i times\n"
+           "\t[-k kill_percentage (default: %llu%%)]\n"
            "\t[-p pool_name (default: %s)]\n"
-           "\t[-f file directory for vdev files (default: %s)]\n"
-           "\t[-V(erbose)] (use multiple times for ever more blather)\n"
-           "\t[-E(xisting)] (use existing pool instead of creating new one)\n"
-           "\t[-T time] total run time (default: %llu sec)\n"
-           "\t[-P passtime] time per pass (default: %llu sec)\n"
+           "\t[-f dir (default: %s)] file directory for vdev files\n"
+           "\t[-V] verbose (use multiple times for ever more blather)\n"
+           "\t[-E] use existing pool instead of creating new one\n"
+           "\t[-T time (default: %llu sec)] total run time\n"
+           "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n"
+           "\t[-P passtime (default: %llu sec)] time per pass\n"
            "\t[-h] (print help)\n"
            "",
            cmdname,
@@ -409,31 +501,11 @@ usage(boolean_t requested)
            zopt_pool,                                  /* -p */
            zopt_dir,                                   /* -f */
            (u_longlong_t)zopt_time,                    /* -T */
+           (u_longlong_t)zopt_maxloops,                /* -F */
            (u_longlong_t)zopt_passtime);               /* -P */
        exit(requested ? 0 : 1);
 }
 
-static uint64_t
-ztest_random(uint64_t range)
-{
-       uint64_t r;
-
-       if (range == 0)
-               return (0);
-
-       if (read(ztest_random_fd, &r, sizeof (r)) != sizeof (r))
-               fatal(1, "short read from /dev/urandom");
-
-       return (r % range);
-}
-
-/* ARGSUSED */
-static void
-ztest_record_enospc(char *s)
-{
-       ztest_shared->zs_enospc_count++;
-}
-
 static void
 process_options(int argc, char **argv)
 {
@@ -444,7 +516,7 @@ process_options(int argc, char **argv)
        metaslab_gang_bang = 32 << 10;
 
        while ((opt = getopt(argc, argv,
-           "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:h")) != EOF) {
+           "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:hF:")) != EOF) {
                value = 0;
                switch (opt) {
                case 'v':
@@ -460,6 +532,7 @@ process_options(int argc, char **argv)
                case 'k':
                case 'T':
                case 'P':
+               case 'F':
                        value = nicenumtoull(optarg);
                }
                switch (opt) {
@@ -514,6 +587,9 @@ process_options(int argc, char **argv)
                case 'P':
                        zopt_passtime = MAX(1, value);
                        break;
+               case 'F':
+                       zopt_maxloops = MAX(1, value);
+                       break;
                case 'h':
                        usage(B_TRUE);
                        break;
@@ -526,8 +602,37 @@ process_options(int argc, char **argv)
 
        zopt_raidz_parity = MIN(zopt_raidz_parity, zopt_raidz - 1);
 
-       zopt_vdevtime = (zopt_vdevs > 0 ? zopt_time / zopt_vdevs : UINT64_MAX);
-       zopt_maxfaults = MAX(zopt_mirrors, 1) * (zopt_raidz_parity + 1) - 1;
+       zopt_vdevtime = (zopt_vdevs > 0 ? zopt_time * NANOSEC / zopt_vdevs :
+           UINT64_MAX >> 2);
+}
+
+static void
+ztest_kill(ztest_shared_t *zs)
+{
+       zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(zs->zs_spa));
+       zs->zs_space = metaslab_class_get_space(spa_normal_class(zs->zs_spa));
+       (void) kill(getpid(), SIGKILL);
+}
+
+static uint64_t
+ztest_random(uint64_t range)
+{
+       uint64_t r;
+
+       if (range == 0)
+               return (0);
+
+       if (read(ztest_random_fd, &r, sizeof (r)) != sizeof (r))
+               fatal(1, "short read from /dev/urandom");
+
+       return (r % range);
+}
+
+/* ARGSUSED */
+static void
+ztest_record_enospc(const char *s)
+{
+       ztest_shared->zs_enospc_count++;
 }
 
 static uint64_t
@@ -556,7 +661,7 @@ make_vdev_file(char *path, char *aux, size_t size, uint64_t ashift)
                        (void) sprintf(path, ztest_aux_template,
                            zopt_dir, zopt_pool, aux, vdev);
                } else {
-                       vdev = ztest_shared->zs_vdev_primaries++;
+                       vdev = ztest_shared->zs_vdev_next_leaf++;
                        (void) sprintf(path, ztest_dev_template,
                            zopt_dir, zopt_pool, vdev);
                }
@@ -667,249 +772,1466 @@ make_vdev_root(char *path, char *aux, size_t size, uint64_t ashift,
        return (root);
 }
 
-static void
-ztest_set_random_blocksize(objset_t *os, uint64_t object, dmu_tx_t *tx)
+static int
+ztest_random_blocksize(void)
 {
-       int bs = SPA_MINBLOCKSHIFT +
-           ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1);
-       int ibs = DN_MIN_INDBLKSHIFT +
-           ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1);
-       int error;
+       return (1 << (SPA_MINBLOCKSHIFT +
+           ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)));
+}
 
-       error = dmu_object_set_blocksize(os, object, 1ULL << bs, ibs, tx);
-       if (error) {
-               char osname[300];
-               dmu_objset_name(os, osname);
-               fatal(0, "dmu_object_set_blocksize('%s', %llu, %d, %d) = %d",
-                   osname, object, 1 << bs, ibs, error);
-       }
+static int
+ztest_random_ibshift(void)
+{
+       return (DN_MIN_INDBLKSHIFT +
+           ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1));
 }
 
-static uint8_t
-ztest_random_checksum(void)
+static uint64_t
+ztest_random_vdev_top(spa_t *spa, boolean_t log_ok)
 {
-       uint8_t checksum;
+       uint64_t top;
+       vdev_t *rvd = spa->spa_root_vdev;
+       vdev_t *tvd;
 
-       do {
-               checksum = ztest_random(ZIO_CHECKSUM_FUNCTIONS);
-       } while (zio_checksum_table[checksum].ci_zbt);
+       ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
-       if (checksum == ZIO_CHECKSUM_OFF)
-               checksum = ZIO_CHECKSUM_ON;
+       do {
+               top = ztest_random(rvd->vdev_children);
+               tvd = rvd->vdev_child[top];
+       } while (tvd->vdev_ishole || (tvd->vdev_islog && !log_ok) ||
+           tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL);
 
-       return (checksum);
+       return (top);
 }
 
-static uint8_t
-ztest_random_compress(void)
+static uint64_t
+ztest_random_dsl_prop(zfs_prop_t prop)
 {
-       return ((uint8_t)ztest_random(ZIO_COMPRESS_FUNCTIONS));
+       uint64_t value;
+
+       do {
+               value = zfs_prop_random_value(prop, ztest_random(-1ULL));
+       } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF);
+
+       return (value);
 }
 
 static int
-ztest_replay_create(objset_t *os, lr_create_t *lr, boolean_t byteswap)
+ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
+    boolean_t inherit)
 {
-       dmu_tx_t *tx;
+       const char *propname = zfs_prop_to_name(prop);
+       const char *valname;
+       char setpoint[MAXPATHLEN];
+       uint64_t curval;
        int error;
 
-       if (byteswap)
-               byteswap_uint64_array(lr, sizeof (*lr));
+       error = dsl_prop_set(osname, propname,
+           (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL),
+           sizeof (value), 1, &value);
 
-       tx = dmu_tx_create(os);
-       dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-       error = dmu_tx_assign(tx, TXG_WAIT);
-       if (error) {
-               dmu_tx_abort(tx);
+       if (error == ENOSPC) {
+               ztest_record_enospc(FTAG);
                return (error);
        }
-
-       error = dmu_object_claim(os, lr->lr_doid, lr->lr_mode, 0,
-           DMU_OT_NONE, 0, tx);
        ASSERT3U(error, ==, 0);
-       dmu_tx_commit(tx);
 
-       if (zopt_verbose >= 5) {
-               char osname[MAXNAMELEN];
-               dmu_objset_name(os, osname);
-               (void) printf("replay create of %s object %llu"
-                   " in txg %llu = %d\n",
-                   osname, (u_longlong_t)lr->lr_doid,
-                   (u_longlong_t)dmu_tx_get_txg(tx), error);
+       VERIFY3U(dsl_prop_get(osname, propname, sizeof (curval),
+           1, &curval, setpoint), ==, 0);
+
+       if (zopt_verbose >= 6) {
+               VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0);
+               (void) printf("%s %s = %s at '%s'\n",
+                   osname, propname, valname, setpoint);
        }
 
        return (error);
 }
 
 static int
-ztest_replay_remove(objset_t *os, lr_remove_t *lr, boolean_t byteswap)
+ztest_spa_prop_set_uint64(ztest_shared_t *zs, zpool_prop_t prop, uint64_t value)
 {
-       dmu_tx_t *tx;
+       spa_t *spa = zs->zs_spa;
+       nvlist_t *props = NULL;
        int error;
 
-       if (byteswap)
-               byteswap_uint64_array(lr, sizeof (*lr));
+       VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
+       VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0);
 
-       tx = dmu_tx_create(os);
-       dmu_tx_hold_free(tx, lr->lr_doid, 0, DMU_OBJECT_END);
-       error = dmu_tx_assign(tx, TXG_WAIT);
-       if (error) {
-               dmu_tx_abort(tx);
+       error = spa_prop_set(spa, props);
+
+       nvlist_free(props);
+
+       if (error == ENOSPC) {
+               ztest_record_enospc(FTAG);
                return (error);
        }
-
-       error = dmu_object_free(os, lr->lr_doid, tx);
-       dmu_tx_commit(tx);
+       ASSERT3U(error, ==, 0);
 
        return (error);
 }
 
-zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
-       NULL,                   /* 0 no such transaction type */
-       ztest_replay_create,    /* TX_CREATE */
-       NULL,                   /* TX_MKDIR */
-       NULL,                   /* TX_MKXATTR */
-       NULL,                   /* TX_SYMLINK */
-       ztest_replay_remove,    /* TX_REMOVE */
-       NULL,                   /* TX_RMDIR */
-       NULL,                   /* TX_LINK */
-       NULL,                   /* TX_RENAME */
-       NULL,                   /* TX_WRITE */
-       NULL,                   /* TX_TRUNCATE */
-       NULL,                   /* TX_SETATTR */
-       NULL,                   /* TX_ACL */
-};
+static void
+ztest_rll_init(rll_t *rll)
+{
+       rll->rll_writer = NULL;
+       rll->rll_readers = 0;
+       VERIFY(_mutex_init(&rll->rll_lock, USYNC_THREAD, NULL) == 0);
+       VERIFY(cond_init(&rll->rll_cv, USYNC_THREAD, NULL) == 0);
+}
 
-/*
- * Verify that we can't destroy an active pool, create an existing pool,
- * or create a pool with a bad vdev spec.
- */
-void
-ztest_spa_create_destroy(ztest_args_t *za)
+static void
+ztest_rll_destroy(rll_t *rll)
 {
-       int error;
-       spa_t *spa;
-       nvlist_t *nvroot;
+       ASSERT(rll->rll_writer == NULL);
+       ASSERT(rll->rll_readers == 0);
+       VERIFY(_mutex_destroy(&rll->rll_lock) == 0);
+       VERIFY(cond_destroy(&rll->rll_cv) == 0);
+}
 
-       /*
-        * Attempt to create using a bad file.
-        */
-       nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
-       error = spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL);
-       nvlist_free(nvroot);
-       if (error != ENOENT)
-               fatal(0, "spa_create(bad_file) = %d", error);
+static void
+ztest_rll_lock(rll_t *rll, rl_type_t type)
+{
+       VERIFY(mutex_lock(&rll->rll_lock) == 0);
 
-       /*
-        * Attempt to create using a bad mirror.
-        */
-       nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1);
-       error = spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL);
-       nvlist_free(nvroot);
-       if (error != ENOENT)
-               fatal(0, "spa_create(bad_mirror) = %d", error);
+       if (type == RL_READER) {
+               while (rll->rll_writer != NULL)
+                       (void) cond_wait(&rll->rll_cv, &rll->rll_lock);
+               rll->rll_readers++;
+       } else {
+               while (rll->rll_writer != NULL || rll->rll_readers)
+                       (void) cond_wait(&rll->rll_cv, &rll->rll_lock);
+               rll->rll_writer = curthread;
+       }
 
-       /*
-        * Attempt to create an existing pool.  It shouldn't matter
-        * what's in the nvroot; we should fail with EEXIST.
-        */
-       (void) rw_rdlock(&ztest_shared->zs_name_lock);
-       nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
-       error = spa_create(za->za_pool, nvroot, NULL, NULL, NULL);
-       nvlist_free(nvroot);
-       if (error != EEXIST)
-               fatal(0, "spa_create(whatever) = %d", error);
+       VERIFY(mutex_unlock(&rll->rll_lock) == 0);
+}
 
-       error = spa_open(za->za_pool, &spa, FTAG);
-       if (error)
-               fatal(0, "spa_open() = %d", error);
+static void
+ztest_rll_unlock(rll_t *rll)
+{
+       VERIFY(mutex_lock(&rll->rll_lock) == 0);
 
-       error = spa_destroy(za->za_pool);
-       if (error != EBUSY)
-               fatal(0, "spa_destroy() = %d", error);
+       if (rll->rll_writer) {
+               ASSERT(rll->rll_readers == 0);
+               rll->rll_writer = NULL;
+       } else {
+               ASSERT(rll->rll_readers != 0);
+               ASSERT(rll->rll_writer == NULL);
+               rll->rll_readers--;
+       }
 
-       spa_close(spa, FTAG);
-       (void) rw_unlock(&ztest_shared->zs_name_lock);
+       if (rll->rll_writer == NULL && rll->rll_readers == 0)
+               VERIFY(cond_broadcast(&rll->rll_cv) == 0);
+
+       VERIFY(mutex_unlock(&rll->rll_lock) == 0);
 }
 
-static vdev_t *
-vdev_lookup_by_path(vdev_t *vd, const char *path)
+static void
+ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
 {
-       vdev_t *mvd;
+       rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
 
-       if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
-               return (vd);
+       ztest_rll_lock(rll, type);
+}
 
-       for (int c = 0; c < vd->vdev_children; c++)
-               if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
-                   NULL)
-                       return (mvd);
+static void
+ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
+{
+       rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
 
-       return (NULL);
+       ztest_rll_unlock(rll);
 }
 
-/*
- * Verify that vdev_add() works as expected.
- */
-void
-ztest_vdev_add_remove(ztest_args_t *za)
+static rl_t *
+ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
+    uint64_t size, rl_type_t type)
 {
-       spa_t *spa = za->za_spa;
-       uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
-       nvlist_t *nvroot;
-       int error;
+       uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
+       rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
+       rl_t *rl;
 
-       (void) mutex_lock(&ztest_shared->zs_vdev_lock);
+       rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
+       rl->rl_object = object;
+       rl->rl_offset = offset;
+       rl->rl_size = size;
+       rl->rl_lock = rll;
 
-       spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+       ztest_rll_lock(rll, type);
+
+       return (rl);
+}
 
-       ztest_shared->zs_vdev_primaries =
-           spa->spa_root_vdev->vdev_children * leaves;
+static void
+ztest_range_unlock(rl_t *rl)
+{
+       rll_t *rll = rl->rl_lock;
 
-       spa_config_exit(spa, SCL_VDEV, FTAG);
+       ztest_rll_unlock(rll);
 
-       /*
-        * Make 1/4 of the devices be log devices.
-        */
-       nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
-           ztest_random(4) == 0, zopt_raidz, zopt_mirrors, 1);
+       umem_free(rl, sizeof (*rl));
+}
 
-       error = spa_vdev_add(spa, nvroot);
-       nvlist_free(nvroot);
+static void
+ztest_zd_init(ztest_ds_t *zd, objset_t *os)
+{
+       zd->zd_os = os;
+       zd->zd_zilog = dmu_objset_zil(os);
+       zd->zd_seq = 0;
+       dmu_objset_name(os, zd->zd_name);
+
+       VERIFY(_mutex_init(&zd->zd_dirobj_lock, USYNC_THREAD, NULL) == 0);
 
-       (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+       for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
+               ztest_rll_init(&zd->zd_object_lock[l]);
 
-       if (error == ENOSPC)
-               ztest_record_enospc("spa_vdev_add");
-       else if (error != 0)
-               fatal(0, "spa_vdev_add() = %d", error);
+       for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
+               ztest_rll_init(&zd->zd_range_lock[l]);
 }
 
-/*
- * Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
- */
-void
-ztest_vdev_aux_add_remove(ztest_args_t *za)
+static void
+ztest_zd_fini(ztest_ds_t *zd)
 {
-       spa_t *spa = za->za_spa;
-       vdev_t *rvd = spa->spa_root_vdev;
-       spa_aux_vdev_t *sav;
-       char *aux;
-       uint64_t guid = 0;
-       int error;
+       VERIFY(_mutex_destroy(&zd->zd_dirobj_lock) == 0);
 
-       if (ztest_random(2) == 0) {
-               sav = &spa->spa_spares;
-               aux = ZPOOL_CONFIG_SPARES;
-       } else {
-               sav = &spa->spa_l2cache;
-               aux = ZPOOL_CONFIG_L2CACHE;
-       }
+       for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
+               ztest_rll_destroy(&zd->zd_object_lock[l]);
 
-       (void) mutex_lock(&ztest_shared->zs_vdev_lock);
+       for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
+               ztest_rll_destroy(&zd->zd_range_lock[l]);
+}
 
-       spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+#define        TXG_MIGHTWAIT   (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)
 
-       if (sav->sav_count != 0 && ztest_random(4) == 0) {
+static uint64_t
+ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
+{
+       uint64_t txg;
+       int error;
+
+       /*
+        * Attempt to assign tx to some transaction group.
+        */
+       error = dmu_tx_assign(tx, txg_how);
+       if (error) {
+               if (error == ERESTART) {
+                       ASSERT(txg_how == TXG_NOWAIT);
+                       dmu_tx_wait(tx);
+               } else {
+                       ASSERT3U(error, ==, ENOSPC);
+                       ztest_record_enospc(tag);
+               }
+               dmu_tx_abort(tx);
+               return (0);
+       }
+       txg = dmu_tx_get_txg(tx);
+       ASSERT(txg != 0);
+       return (txg);
+}
+
+static void
+ztest_pattern_set(void *buf, uint64_t size, uint64_t value)
+{
+       uint64_t *ip = buf;
+       uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
+
+       while (ip < ip_end)
+               *ip++ = value;
+}
+
+static boolean_t
+ztest_pattern_match(void *buf, uint64_t size, uint64_t value)
+{
+       uint64_t *ip = buf;
+       uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
+       uint64_t diff = 0;
+
+       while (ip < ip_end)
+               diff |= (value - *ip++);
+
+       return (diff == 0);
+}
+
+static void
+ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
+    uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
+{
+       bt->bt_magic = BT_MAGIC;
+       bt->bt_objset = dmu_objset_id(os);
+       bt->bt_object = object;
+       bt->bt_offset = offset;
+       bt->bt_gen = gen;
+       bt->bt_txg = txg;
+       bt->bt_crtxg = crtxg;
+}
+
+static void
+ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
+    uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
+{
+       ASSERT(bt->bt_magic == BT_MAGIC);
+       ASSERT(bt->bt_objset == dmu_objset_id(os));
+       ASSERT(bt->bt_object == object);
+       ASSERT(bt->bt_offset == offset);
+       ASSERT(bt->bt_gen <= gen);
+       ASSERT(bt->bt_txg <= txg);
+       ASSERT(bt->bt_crtxg == crtxg);
+}
+
+static ztest_block_tag_t *
+ztest_bt_bonus(dmu_buf_t *db)
+{
+       dmu_object_info_t doi;
+       ztest_block_tag_t *bt;
+
+       dmu_object_info_from_db(db, &doi);
+       ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
+       ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt));
+       bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt));
+
+       return (bt);
+}
+
+/*
+ * ZIL logging ops
+ */
+
+#define        lrz_type        lr_mode
+#define        lrz_blocksize   lr_uid
+#define        lrz_ibshift     lr_gid
+#define        lrz_bonustype   lr_rdev
+#define        lrz_bonuslen    lr_crtime[1]
+
+static uint64_t
+ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
+{
+       char *name = (void *)(lr + 1);          /* name follows lr */
+       size_t namesize = strlen(name) + 1;
+       itx_t *itx;
+
+       if (zil_replaying(zd->zd_zilog, tx))
+               return (0);
+
+       itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize);
+       bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+           sizeof (*lr) + namesize - sizeof (lr_t));
+
+       return (zil_itx_assign(zd->zd_zilog, itx, tx));
+}
+
+static uint64_t
+ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr)
+{
+       char *name = (void *)(lr + 1);          /* name follows lr */
+       size_t namesize = strlen(name) + 1;
+       itx_t *itx;
+
+       if (zil_replaying(zd->zd_zilog, tx))
+               return (0);
+
+       itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize);
+       bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+           sizeof (*lr) + namesize - sizeof (lr_t));
+
+       return (zil_itx_assign(zd->zd_zilog, itx, tx));
+}
+
+static uint64_t
+ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
+{
+       itx_t *itx;
+       itx_wr_state_t write_state = ztest_random(WR_NUM_STATES);
+
+       if (zil_replaying(zd->zd_zilog, tx))
+               return (0);
+
+       if (lr->lr_length > ZIL_MAX_LOG_DATA)
+               write_state = WR_INDIRECT;
+
+       itx = zil_itx_create(TX_WRITE,
+           sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0));
+
+       if (write_state == WR_COPIED &&
+           dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
+           ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) {
+               zil_itx_destroy(itx);
+               itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+               write_state = WR_NEED_COPY;
+       }
+       itx->itx_private = zd;
+       itx->itx_wr_state = write_state;
+       itx->itx_sync = (ztest_random(8) == 0);
+       itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0);
+
+       bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+           sizeof (*lr) - sizeof (lr_t));
+
+       return (zil_itx_assign(zd->zd_zilog, itx, tx));
+}
+
+static uint64_t
+ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr)
+{
+       itx_t *itx;
+
+       if (zil_replaying(zd->zd_zilog, tx))
+               return (0);
+
+       itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
+       bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+           sizeof (*lr) - sizeof (lr_t));
+
+       return (zil_itx_assign(zd->zd_zilog, itx, tx));
+}
+
+static uint64_t
+ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr)
+{
+       itx_t *itx;
+
+       if (zil_replaying(zd->zd_zilog, tx))
+               return (0);
+
+       itx = zil_itx_create(TX_SETATTR, sizeof (*lr));
+       bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+           sizeof (*lr) - sizeof (lr_t));
+
+       return (zil_itx_assign(zd->zd_zilog, itx, tx));
+}
+
+/*
+ * ZIL replay ops
+ */
+static int
+ztest_replay_create(ztest_ds_t *zd, lr_create_t *lr, boolean_t byteswap)
+{
+       char *name = (void *)(lr + 1);          /* name follows lr */
+       objset_t *os = zd->zd_os;
+       ztest_block_tag_t *bbt;
+       dmu_buf_t *db;
+       dmu_tx_t *tx;
+       uint64_t txg;
+       int error = 0;
+
+       if (byteswap)
+               byteswap_uint64_array(lr, sizeof (*lr));
+
+       ASSERT(lr->lr_doid == ZTEST_DIROBJ);
+       ASSERT(name[0] != '\0');
+
+       tx = dmu_tx_create(os);
+
+       dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name);
+
+       if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
+               dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+       } else {
+               dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+       }
+
+       txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+       if (txg == 0)
+               return (ENOSPC);
+
+       ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid);
+
+       if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
+               if (lr->lr_foid == 0) {
+                       lr->lr_foid = zap_create(os,
+                           lr->lrz_type, lr->lrz_bonustype,
+                           lr->lrz_bonuslen, tx);
+               } else {
+                       error = zap_create_claim(os, lr->lr_foid,
+                           lr->lrz_type, lr->lrz_bonustype,
+                           lr->lrz_bonuslen, tx);
+               }
+       } else {
+               if (lr->lr_foid == 0) {
+                       lr->lr_foid = dmu_object_alloc(os,
+                           lr->lrz_type, 0, lr->lrz_bonustype,
+                           lr->lrz_bonuslen, tx);
+               } else {
+                       error = dmu_object_claim(os, lr->lr_foid,
+                           lr->lrz_type, 0, lr->lrz_bonustype,
+                           lr->lrz_bonuslen, tx);
+               }
+       }
+
+       if (error) {
+               ASSERT3U(error, ==, EEXIST);
+               ASSERT(zd->zd_zilog->zl_replay);
+               dmu_tx_commit(tx);
+               return (error);
+       }
+
+       ASSERT(lr->lr_foid != 0);
+
+       if (lr->lrz_type != DMU_OT_ZAP_OTHER)
+               VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid,
+                   lr->lrz_blocksize, lr->lrz_ibshift, tx));
+
+       VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+       bbt = ztest_bt_bonus(db);
+       dmu_buf_will_dirty(db, tx);
+       ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_gen, txg, txg);
+       dmu_buf_rele(db, FTAG);
+
+       VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1,
+           &lr->lr_foid, tx));
+
+       (void) ztest_log_create(zd, tx, lr);
+
+       dmu_tx_commit(tx);
+
+       return (0);
+}
+
+static int
+ztest_replay_remove(ztest_ds_t *zd, lr_remove_t *lr, boolean_t byteswap)
+{
+       char *name = (void *)(lr + 1);          /* name follows lr */
+       objset_t *os = zd->zd_os;
+       dmu_object_info_t doi;
+       dmu_tx_t *tx;
+       uint64_t object, txg;
+
+       if (byteswap)
+               byteswap_uint64_array(lr, sizeof (*lr));
+
+       ASSERT(lr->lr_doid == ZTEST_DIROBJ);
+       ASSERT(name[0] != '\0');
+
+       VERIFY3U(0, ==,
+           zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object));
+       ASSERT(object != 0);
+
+       ztest_object_lock(zd, object, RL_WRITER);
+
+       VERIFY3U(0, ==, dmu_object_info(os, object, &doi));
+
+       tx = dmu_tx_create(os);
+
+       dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name);
+       dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
+
+       txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+       if (txg == 0) {
+               ztest_object_unlock(zd, object);
+               return (ENOSPC);
+       }
+
+       if (doi.doi_type == DMU_OT_ZAP_OTHER) {
+               VERIFY3U(0, ==, zap_destroy(os, object, tx));
+       } else {
+               VERIFY3U(0, ==, dmu_object_free(os, object, tx));
+       }
+
+       VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx));
+
+       (void) ztest_log_remove(zd, tx, lr);
+
+       dmu_tx_commit(tx);
+
+       ztest_object_unlock(zd, object);
+
+       return (0);
+}
+
+static int
+ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap)
+{
+       objset_t *os = zd->zd_os;
+       void *data = lr + 1;                    /* data follows lr */
+       uint64_t offset, length;
+       ztest_block_tag_t *bt = data;
+       ztest_block_tag_t *bbt;
+       uint64_t gen, txg, lrtxg, crtxg;
+       dmu_object_info_t doi;
+       dmu_tx_t *tx;
+       dmu_buf_t *db;
+       arc_buf_t *abuf = NULL;
+       rl_t *rl;
+
+       if (byteswap)
+               byteswap_uint64_array(lr, sizeof (*lr));
+
+       offset = lr->lr_offset;
+       length = lr->lr_length;
+
+       /* If it's a dmu_sync() block, write the whole block */
+       if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+               uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+               if (length < blocksize) {
+                       offset -= offset % blocksize;
+                       length = blocksize;
+               }
+       }
+
+       if (bt->bt_magic == BSWAP_64(BT_MAGIC))
+               byteswap_uint64_array(bt, sizeof (*bt));
+
+       if (bt->bt_magic != BT_MAGIC)
+               bt = NULL;
+
+       ztest_object_lock(zd, lr->lr_foid, RL_READER);
+       rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);
+
+       VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+
+       dmu_object_info_from_db(db, &doi);
+
+       bbt = ztest_bt_bonus(db);
+       ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+       gen = bbt->bt_gen;
+       crtxg = bbt->bt_crtxg;
+       lrtxg = lr->lr_common.lrc_txg;
+
+       tx = dmu_tx_create(os);
+
+       dmu_tx_hold_write(tx, lr->lr_foid, offset, length);
+
+       if (ztest_random(8) == 0 && length == doi.doi_data_block_size &&
+           P2PHASE(offset, length) == 0)
+               abuf = dmu_request_arcbuf(db, length);
+
+       txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+       if (txg == 0) {
+               if (abuf != NULL)
+                       dmu_return_arcbuf(abuf);
+               dmu_buf_rele(db, FTAG);
+               ztest_range_unlock(rl);
+               ztest_object_unlock(zd, lr->lr_foid);
+               return (ENOSPC);
+       }
+
+       if (bt != NULL) {
+               /*
+                * Usually, verify the old data before writing new data --
+                * but not always, because we also want to verify correct
+                * behavior when the data was not recently read into cache.
+                */
+               ASSERT(offset % doi.doi_data_block_size == 0);
+               if (ztest_random(4) != 0) {
+                       int prefetch = ztest_random(2) ?
+                           DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
+                       ztest_block_tag_t rbt;
+
+                       VERIFY(dmu_read(os, lr->lr_foid, offset,
+                           sizeof (rbt), &rbt, prefetch) == 0);
+                       if (rbt.bt_magic == BT_MAGIC) {
+                               ztest_bt_verify(&rbt, os, lr->lr_foid,
+                                   offset, gen, txg, crtxg);
+                       }
+               }
+
+               /*
+                * Writes can appear to be newer than the bonus buffer because
+                * the ztest_get_data() callback does a dmu_read() of the
+                * open-context data, which may be different than the data
+                * as it was when the write was generated.
+                */
+               if (zd->zd_zilog->zl_replay) {
+                       ztest_bt_verify(bt, os, lr->lr_foid, offset,
+                           MAX(gen, bt->bt_gen), MAX(txg, lrtxg),
+                           bt->bt_crtxg);
+               }
+
+               /*
+                * Set the bt's gen/txg to the bonus buffer's gen/txg
+                * so that all of the usual ASSERTs will work.
+                */
+               ztest_bt_generate(bt, os, lr->lr_foid, offset, gen, txg, crtxg);
+       }
+
+       if (abuf == NULL) {
+               dmu_write(os, lr->lr_foid, offset, length, data, tx);
+       } else {
+               bcopy(data, abuf->b_data, length);
+               dmu_assign_arcbuf(db, offset, abuf, tx);
+       }
+
+       (void) ztest_log_write(zd, tx, lr);
+
+       dmu_buf_rele(db, FTAG);
+
+       dmu_tx_commit(tx);
+
+       ztest_range_unlock(rl);
+       ztest_object_unlock(zd, lr->lr_foid);
+
+       return (0);
+}
+
+static int
+ztest_replay_truncate(ztest_ds_t *zd, lr_truncate_t *lr, boolean_t byteswap)
+{
+       objset_t *os = zd->zd_os;
+       dmu_tx_t *tx;
+       uint64_t txg;
+       rl_t *rl;
+
+       if (byteswap)
+               byteswap_uint64_array(lr, sizeof (*lr));
+
+       ztest_object_lock(zd, lr->lr_foid, RL_READER);
+       rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
+           RL_WRITER);
+
+       tx = dmu_tx_create(os);
+
+       dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length);
+
+       txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+       if (txg == 0) {
+               ztest_range_unlock(rl);
+               ztest_object_unlock(zd, lr->lr_foid);
+               return (ENOSPC);
+       }
+
+       VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset,
+           lr->lr_length, tx) == 0);
+
+       (void) ztest_log_truncate(zd, tx, lr);
+
+       dmu_tx_commit(tx);
+
+       ztest_range_unlock(rl);
+       ztest_object_unlock(zd, lr->lr_foid);
+
+       return (0);
+}
+
+static int
+ztest_replay_setattr(ztest_ds_t *zd, lr_setattr_t *lr, boolean_t byteswap)
+{
+       objset_t *os = zd->zd_os;
+       dmu_tx_t *tx;
+       dmu_buf_t *db;
+       ztest_block_tag_t *bbt;
+       uint64_t txg, lrtxg, crtxg;
+
+       if (byteswap)
+               byteswap_uint64_array(lr, sizeof (*lr));
+
+       ztest_object_lock(zd, lr->lr_foid, RL_WRITER);
+
+       VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+
+       tx = dmu_tx_create(os);
+       dmu_tx_hold_bonus(tx, lr->lr_foid);
+
+       txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+       if (txg == 0) {
+               dmu_buf_rele(db, FTAG);
+               ztest_object_unlock(zd, lr->lr_foid);
+               return (ENOSPC);
+       }
+
+       bbt = ztest_bt_bonus(db);
+       ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+       crtxg = bbt->bt_crtxg;
+       lrtxg = lr->lr_common.lrc_txg;
+
+       if (zd->zd_zilog->zl_replay) {
+               ASSERT(lr->lr_size != 0);
+               ASSERT(lr->lr_mode != 0);
+               ASSERT(lrtxg != 0);
+       } else {
+               /*
+                * Randomly change the size and increment the generation.
+                */
+               lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) *
+                   sizeof (*bbt);
+               lr->lr_mode = bbt->bt_gen + 1;
+               ASSERT(lrtxg == 0);
+       }
+
+       /*
+        * Verify that the current bonus buffer is not newer than our txg.
+        */
+       ztest_bt_verify(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode,
+           MAX(txg, lrtxg), crtxg);
+
+       dmu_buf_will_dirty(db, tx);
+
+       ASSERT3U(lr->lr_size, >=, sizeof (*bbt));
+       ASSERT3U(lr->lr_size, <=, db->db_size);
+       VERIFY3U(dmu_set_bonus(db, lr->lr_size, tx), ==, 0);
+       bbt = ztest_bt_bonus(db);
+
+       ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, txg, crtxg);
+
+       dmu_buf_rele(db, FTAG);
+
+       (void) ztest_log_setattr(zd, tx, lr);
+
+       dmu_tx_commit(tx);
+
+       ztest_object_unlock(zd, lr->lr_foid);
+
+       return (0);
+}
+
+zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
+       NULL,                   /* 0 no such transaction type */
+       ztest_replay_create,    /* TX_CREATE */
+       NULL,                   /* TX_MKDIR */
+       NULL,                   /* TX_MKXATTR */
+       NULL,                   /* TX_SYMLINK */
+       ztest_replay_remove,    /* TX_REMOVE */
+       NULL,                   /* TX_RMDIR */
+       NULL,                   /* TX_LINK */
+       NULL,                   /* TX_RENAME */
+       ztest_replay_write,     /* TX_WRITE */
+       ztest_replay_truncate,  /* TX_TRUNCATE */
+       ztest_replay_setattr,   /* TX_SETATTR */
+       NULL,                   /* TX_ACL */
+       NULL,                   /* TX_CREATE_ACL */
+       NULL,                   /* TX_CREATE_ATTR */
+       NULL,                   /* TX_CREATE_ACL_ATTR */
+       NULL,                   /* TX_MKDIR_ACL */
+       NULL,                   /* TX_MKDIR_ATTR */
+       NULL,                   /* TX_MKDIR_ACL_ATTR */
+       NULL,                   /* TX_WRITE2 */
+};
+
+/*
+ * ZIL get_data callbacks
+ */
+
+static void
+ztest_get_done(zgd_t *zgd, int error)
+{
+       ztest_ds_t *zd = zgd->zgd_private;
+       uint64_t object = zgd->zgd_rl->rl_object;
+
+       if (zgd->zgd_db)
+               dmu_buf_rele(zgd->zgd_db, zgd);
+
+       ztest_range_unlock(zgd->zgd_rl);
+       ztest_object_unlock(zd, object);
+
+       if (error == 0 && zgd->zgd_bp)
+               zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+
+       umem_free(zgd, sizeof (*zgd));
+}
+
+static int
+ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
+{
+       ztest_ds_t *zd = arg;
+       objset_t *os = zd->zd_os;
+       uint64_t object = lr->lr_foid;
+       uint64_t offset = lr->lr_offset;
+       uint64_t size = lr->lr_length;
+       blkptr_t *bp = &lr->lr_blkptr;
+       uint64_t txg = lr->lr_common.lrc_txg;
+       uint64_t crtxg;
+       dmu_object_info_t doi;
+       dmu_buf_t *db;
+       zgd_t *zgd;
+       int error;
+
+       ztest_object_lock(zd, object, RL_READER);
+       error = dmu_bonus_hold(os, object, FTAG, &db);
+       if (error) {
+               ztest_object_unlock(zd, object);
+               return (error);
+       }
+
+       crtxg = ztest_bt_bonus(db)->bt_crtxg;
+
+       if (crtxg == 0 || crtxg > txg) {
+               dmu_buf_rele(db, FTAG);
+               ztest_object_unlock(zd, object);
+               return (ENOENT);
+       }
+
+       dmu_object_info_from_db(db, &doi);
+       dmu_buf_rele(db, FTAG);
+       db = NULL;
+
+       zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
+       zgd->zgd_zilog = zd->zd_zilog;
+       zgd->zgd_private = zd;
+
+       if (buf != NULL) {      /* immediate write */
+               zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
+                   RL_READER);
+
+               error = dmu_read(os, object, offset, size, buf,
+                   DMU_READ_NO_PREFETCH);
+               ASSERT(error == 0);
+       } else {
+               size = doi.doi_data_block_size;
+               if (ISP2(size)) {
+                       offset = P2ALIGN(offset, size);
+               } else {
+                       ASSERT(offset < size);
+                       offset = 0;
+               }
+
+               zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
+                   RL_READER);
+
+               error = dmu_buf_hold(os, object, offset, zgd, &db,
+                   DMU_READ_NO_PREFETCH);
+
+               if (error == 0) {
+                       zgd->zgd_db = db;
+                       zgd->zgd_bp = bp;
+
+                       ASSERT(db->db_offset == offset);
+                       ASSERT(db->db_size == size);
+
+                       error = dmu_sync(zio, lr->lr_common.lrc_txg,
+                           ztest_get_done, zgd);
+
+                       if (error == 0)
+                               return (0);
+               }
+       }
+
+       ztest_get_done(zgd, error);
+
+       return (error);
+}
+
+static void *
+ztest_lr_alloc(size_t lrsize, char *name)
+{
+       char *lr;
+       size_t namesize = name ? strlen(name) + 1 : 0;
+
+       lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL);
+
+       if (name)
+               bcopy(name, lr + lrsize, namesize);
+
+       return (lr);
+}
+
+void
+ztest_lr_free(void *lr, size_t lrsize, char *name)
+{
+       size_t namesize = name ? strlen(name) + 1 : 0;
+
+       umem_free(lr, lrsize + namesize);
+}
+
+/*
+ * Look up a bunch of objects.  Returns the number of objects not found.
+ */
+static int
+ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count)
+{
+       int missing = 0;
+       int error;
+
+       ASSERT(_mutex_held(&zd->zd_dirobj_lock));
+
+       for (int i = 0; i < count; i++, od++) {
+               od->od_object = 0;
+               error = zap_lookup(zd->zd_os, od->od_dir, od->od_name,
+                   sizeof (uint64_t), 1, &od->od_object);
+               if (error) {
+                       ASSERT(error == ENOENT);
+                       ASSERT(od->od_object == 0);
+                       missing++;
+               } else {
+                       dmu_buf_t *db;
+                       ztest_block_tag_t *bbt;
+                       dmu_object_info_t doi;
+
+                       ASSERT(od->od_object != 0);
+                       ASSERT(missing == 0);   /* there should be no gaps */
+
+                       ztest_object_lock(zd, od->od_object, RL_READER);
+                       VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os,
+                           od->od_object, FTAG, &db));
+                       dmu_object_info_from_db(db, &doi);
+                       bbt = ztest_bt_bonus(db);
+                       ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+                       od->od_type = doi.doi_type;
+                       od->od_blocksize = doi.doi_data_block_size;
+                       od->od_gen = bbt->bt_gen;
+                       dmu_buf_rele(db, FTAG);
+                       ztest_object_unlock(zd, od->od_object);
+               }
+       }
+
+       return (missing);
+}
+
+static int
+ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count)
+{
+       int missing = 0;
+
+       ASSERT(_mutex_held(&zd->zd_dirobj_lock));
+
+       for (int i = 0; i < count; i++, od++) {
+               if (missing) {
+                       od->od_object = 0;
+                       missing++;
+                       continue;
+               }
+
+               lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
+
+               lr->lr_doid = od->od_dir;
+               lr->lr_foid = 0;        /* 0 to allocate, > 0 to claim */
+               lr->lrz_type = od->od_crtype;
+               lr->lrz_blocksize = od->od_crblocksize;
+               lr->lrz_ibshift = ztest_random_ibshift();
+               lr->lrz_bonustype = DMU_OT_UINT64_OTHER;
+               lr->lrz_bonuslen = dmu_bonus_max();
+               lr->lr_gen = od->od_crgen;
+               lr->lr_crtime[0] = time(NULL);
+
+               if (ztest_replay_create(zd, lr, B_FALSE) != 0) {
+                       ASSERT(missing == 0);
+                       od->od_object = 0;
+                       missing++;
+               } else {
+                       od->od_object = lr->lr_foid;
+                       od->od_type = od->od_crtype;
+                       od->od_blocksize = od->od_crblocksize;
+                       od->od_gen = od->od_crgen;
+                       ASSERT(od->od_object != 0);
+               }
+
+               ztest_lr_free(lr, sizeof (*lr), od->od_name);
+       }
+
+       return (missing);
+}
+
+static int
+ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
+{
+       int missing = 0;
+       int error;
+
+       ASSERT(_mutex_held(&zd->zd_dirobj_lock));
+
+       od += count - 1;
+
+       for (int i = count - 1; i >= 0; i--, od--) {
+               if (missing) {
+                       missing++;
+                       continue;
+               }
+
+               if (od->od_object == 0)
+                       continue;
+
+               lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
+
+               lr->lr_doid = od->od_dir;
+
+               if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) {
+                       ASSERT3U(error, ==, ENOSPC);
+                       missing++;
+               } else {
+                       od->od_object = 0;
+               }
+               ztest_lr_free(lr, sizeof (*lr), od->od_name);
+       }
+
+       return (missing);
+}
+
+static int
+ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size,
+    void *data)
+{
+       lr_write_t *lr;
+       int error;
+
+       lr = ztest_lr_alloc(sizeof (*lr) + size, NULL);
+
+       lr->lr_foid = object;
+       lr->lr_offset = offset;
+       lr->lr_length = size;
+       lr->lr_blkoff = 0;
+       BP_ZERO(&lr->lr_blkptr);
+
+       bcopy(data, lr + 1, size);
+
+       error = ztest_replay_write(zd, lr, B_FALSE);
+
+       ztest_lr_free(lr, sizeof (*lr) + size, NULL);
+
+       return (error);
+}
+
+static int
+ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
+{
+       lr_truncate_t *lr;
+       int error;
+
+       lr = ztest_lr_alloc(sizeof (*lr), NULL);
+
+       lr->lr_foid = object;
+       lr->lr_offset = offset;
+       lr->lr_length = size;
+
+       error = ztest_replay_truncate(zd, lr, B_FALSE);
+
+       ztest_lr_free(lr, sizeof (*lr), NULL);
+
+       return (error);
+}
+
+static int
+ztest_setattr(ztest_ds_t *zd, uint64_t object)
+{
+       lr_setattr_t *lr;
+       int error;
+
+       lr = ztest_lr_alloc(sizeof (*lr), NULL);
+
+       lr->lr_foid = object;
+       lr->lr_size = 0;
+       lr->lr_mode = 0;
+
+       error = ztest_replay_setattr(zd, lr, B_FALSE);
+
+       ztest_lr_free(lr, sizeof (*lr), NULL);
+
+       return (error);
+}
+
+static void
+ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
+{
+       objset_t *os = zd->zd_os;
+       dmu_tx_t *tx;
+       uint64_t txg;
+       rl_t *rl;
+
+       txg_wait_synced(dmu_objset_pool(os), 0);
+
+       ztest_object_lock(zd, object, RL_READER);
+       rl = ztest_range_lock(zd, object, offset, size, RL_WRITER);
+
+       tx = dmu_tx_create(os);
+
+       dmu_tx_hold_write(tx, object, offset, size);
+
+       txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+
+       if (txg != 0) {
+               dmu_prealloc(os, object, offset, size, tx);
+               dmu_tx_commit(tx);
+               txg_wait_synced(dmu_objset_pool(os), txg);
+       } else {
+               (void) dmu_free_long_range(os, object, offset, size);
+       }
+
+       ztest_range_unlock(rl);
+       ztest_object_unlock(zd, object);
+}
+
+static void
+ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
+{
+       ztest_block_tag_t wbt;
+       dmu_object_info_t doi;
+       enum ztest_io_type io_type;
+       uint64_t blocksize;
+       void *data;
+
+       VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0);
+       blocksize = doi.doi_data_block_size;
+       data = umem_alloc(blocksize, UMEM_NOFAIL);
+
+       /*
+        * Pick an i/o type at random, biased toward writing block tags.
+        */
+       io_type = ztest_random(ZTEST_IO_TYPES);
+       if (ztest_random(2) == 0)
+               io_type = ZTEST_IO_WRITE_TAG;
+
+       switch (io_type) {
+
+       case ZTEST_IO_WRITE_TAG:
+               ztest_bt_generate(&wbt, zd->zd_os, object, offset, 0, 0, 0);
+               (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt);
+               break;
+
+       case ZTEST_IO_WRITE_PATTERN:
+               (void) memset(data, 'a' + (object + offset) % 5, blocksize);
+               if (ztest_random(2) == 0) {
+                       /*
+                        * Induce fletcher2 collisions to ensure that
+                        * zio_ddt_collision() detects and resolves them
+                        * when using fletcher2-verify for deduplication.
+                        */
+                       ((uint64_t *)data)[0] ^= 1ULL << 63;
+                       ((uint64_t *)data)[4] ^= 1ULL << 63;
+               }
+               (void) ztest_write(zd, object, offset, blocksize, data);
+               break;
+
+       case ZTEST_IO_WRITE_ZEROES:
+               bzero(data, blocksize);
+               (void) ztest_write(zd, object, offset, blocksize, data);
+               break;
+
+       case ZTEST_IO_TRUNCATE:
+               (void) ztest_truncate(zd, object, offset, blocksize);
+               break;
+
+       case ZTEST_IO_SETATTR:
+               (void) ztest_setattr(zd, object);
+               break;
+       }
+
+       umem_free(data, blocksize);
+}
+
+/*
+ * Initialize an object description template.
+ */
+static void
+ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index,
+    dmu_object_type_t type, uint64_t blocksize, uint64_t gen)
+{
+       od->od_dir = ZTEST_DIROBJ;
+       od->od_object = 0;
+
+       od->od_crtype = type;
+       od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize();
+       od->od_crgen = gen;
+
+       od->od_type = DMU_OT_NONE;
+       od->od_blocksize = 0;
+       od->od_gen = 0;
+
+       (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]",
+           tag, (int64_t)id, index);
+}
+
+/*
+ * Lookup or create the objects for a test using the od template.
+ * If the objects do not all exist, or if 'remove' is specified,
+ * remove any existing objects and create new ones.  Otherwise,
+ * use the existing objects.
+ */
+static int
+ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove)
+{
+       int count = size / sizeof (*od);
+       int rv = 0;
+
+       VERIFY(mutex_lock(&zd->zd_dirobj_lock) == 0);
+       if ((ztest_lookup(zd, od, count) != 0 || remove) &&
+           (ztest_remove(zd, od, count) != 0 ||
+           ztest_create(zd, od, count) != 0))
+               rv = -1;
+       zd->zd_od = od;
+       VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0);
+
+       return (rv);
+}
+
+/* ARGSUSED */
+void
+ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
+{
+       zilog_t *zilog = zd->zd_zilog;
+
+       zil_commit(zilog, UINT64_MAX, ztest_random(ZTEST_OBJECTS));
+
+       /*
+        * Remember the committed values in zd, which is in parent/child
+        * shared memory.  If we die, the next iteration of ztest_run()
+        * will verify that the log really does contain this record.
+        */
+       mutex_enter(&zilog->zl_lock);
+       ASSERT(zd->zd_seq <= zilog->zl_commit_lr_seq);
+       zd->zd_seq = zilog->zl_commit_lr_seq;
+       mutex_exit(&zilog->zl_lock);
+}
+
+/*
+ * Verify that we can't destroy an active pool, create an existing pool,
+ * or create a pool with a bad vdev spec.
+ */
+/* ARGSUSED */
+void
+ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
+{
+       ztest_shared_t *zs = ztest_shared;
+       spa_t *spa;
+       nvlist_t *nvroot;
+
+       /*
+        * Attempt to create using a bad file.
+        */
+       nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
+       VERIFY3U(ENOENT, ==,
+           spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL));
+       nvlist_free(nvroot);
+
+       /*
+        * Attempt to create using a bad mirror.
+        */
+       nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1);
+       VERIFY3U(ENOENT, ==,
+           spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL));
+       nvlist_free(nvroot);
+
+       /*
+        * Attempt to create an existing pool.  It shouldn't matter
+        * what's in the nvroot; we should fail with EEXIST.
+        */
+       (void) rw_rdlock(&zs->zs_name_lock);
+       nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
+       VERIFY3U(EEXIST, ==, spa_create(zs->zs_pool, nvroot, NULL, NULL, NULL));
+       nvlist_free(nvroot);
+       VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
+       VERIFY3U(EBUSY, ==, spa_destroy(zs->zs_pool));
+       spa_close(spa, FTAG);
+
+       (void) rw_unlock(&zs->zs_name_lock);
+}
+
+static vdev_t *
+vdev_lookup_by_path(vdev_t *vd, const char *path)
+{
+       vdev_t *mvd;
+
+       if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
+               return (vd);
+
+       for (int c = 0; c < vd->vdev_children; c++)
+               if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
+                   NULL)
+                       return (mvd);
+
+       return (NULL);
+}
+
+/*
+ * Find the first available hole which can be used as a top-level.
+ */
+int
+find_vdev_hole(spa_t *spa)
+{
+       vdev_t *rvd = spa->spa_root_vdev;
+       int c;
+
+       ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV);
+
+       for (c = 0; c < rvd->vdev_children; c++) {
+               vdev_t *cvd = rvd->vdev_child[c];
+
+               if (cvd->vdev_ishole)
+                       break;
+       }
+       return (c);
+}
+
+/*
+ * Verify that vdev_add() works as expected.
+ */
+/* ARGSUSED */
+void
+ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
+{
+       ztest_shared_t *zs = ztest_shared;
+       spa_t *spa = zs->zs_spa;
+       uint64_t leaves;
+       uint64_t guid;
+       nvlist_t *nvroot;
+       int error;
+
+       VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+       leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * zopt_raidz;
+
+       spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+       ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves;
+
+       /*
+        * If we have slogs then remove them 1/4 of the time.
+        */
+       if (spa_has_slogs(spa) && ztest_random(4) == 0) {
+               /*
+                * Grab the guid from the head of the log class rotor.
+                */
+               guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid;
+
+               spa_config_exit(spa, SCL_VDEV, FTAG);
+
+               /*
+                * We have to grab the zs_name_lock as writer to
+                * prevent a race between removing a slog (dmu_objset_find)
+                * and destroying a dataset. Removing the slog will
+                * grab a reference on the dataset which may cause
+                * dmu_objset_destroy() to fail with EBUSY thus
+                * leaving the dataset in an inconsistent state.
+                */
+               VERIFY(rw_wrlock(&ztest_shared->zs_name_lock) == 0);
+               error = spa_vdev_remove(spa, guid, B_FALSE);
+               VERIFY(rw_unlock(&ztest_shared->zs_name_lock) == 0);
+
+               if (error && error != EEXIST)
+                       fatal(0, "spa_vdev_remove() = %d", error);
+       } else {
+               spa_config_exit(spa, SCL_VDEV, FTAG);
+
+               /*
+                * Make 1/4 of the devices be log devices.
+                */
+               nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
+                   ztest_random(4) == 0, zopt_raidz, zs->zs_mirrors, 1);
+
+               error = spa_vdev_add(spa, nvroot);
+               nvlist_free(nvroot);
+
+               if (error == ENOSPC)
+                       ztest_record_enospc("spa_vdev_add");
+               else if (error != 0)
+                       fatal(0, "spa_vdev_add() = %d", error);
+       }
+
+       VERIFY(mutex_unlock(&ztest_shared->zs_vdev_lock) == 0);
+}
+
+/*
+ * Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
+ */
+/* ARGSUSED */
+void
+ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
+{
+       ztest_shared_t *zs = ztest_shared;
+       spa_t *spa = zs->zs_spa;
+       vdev_t *rvd = spa->spa_root_vdev;
+       spa_aux_vdev_t *sav;
+       char *aux;
+       uint64_t guid = 0;
+       int error;
+
+       if (ztest_random(2) == 0) {
+               sav = &spa->spa_spares;
+               aux = ZPOOL_CONFIG_SPARES;
+       } else {
+               sav = &spa->spa_l2cache;
+               aux = ZPOOL_CONFIG_L2CACHE;
+       }
+
+       VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+
+       spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+       if (sav->sav_count != 0 && ztest_random(4) == 0) {
                /*
                 * Pick a random device to remove.
                 */
@@ -918,12 +2240,12 @@ ztest_vdev_aux_add_remove(ztest_args_t *za)
                /*
                 * Find an unused device we can add.
                 */
-               ztest_shared->zs_vdev_aux = 0;
+               zs->zs_vdev_aux = 0;
                for (;;) {
                        char path[MAXPATHLEN];
                        int c;
                        (void) sprintf(path, ztest_aux_template, zopt_dir,
-                           zopt_pool, aux, ztest_shared->zs_vdev_aux);
+                           zopt_pool, aux, zs->zs_vdev_aux);
                        for (c = 0; c < sav->sav_count; c++)
                                if (strcmp(sav->sav_vdevs[c]->vdev_path,
                                    path) == 0)
@@ -931,7 +2253,7 @@ ztest_vdev_aux_add_remove(ztest_args_t *za)
                        if (c == sav->sav_count &&
                            vdev_lookup_by_path(rvd, path) == NULL)
                                break;
-                       ztest_shared->zs_vdev_aux++;
+                       zs->zs_vdev_aux++;
                }
        }
 
@@ -961,21 +2283,119 @@ ztest_vdev_aux_add_remove(ztest_args_t *za)
                        fatal(0, "spa_vdev_remove(%llu) = %d", guid, error);
        }
 
-       (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+       VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+}
+
+/*
+ * split a pool if it has mirror tlvdevs
+ */
+/* ARGSUSED */
+void
+ztest_split_pool(ztest_ds_t *zd, uint64_t id)
+{
+       ztest_shared_t *zs = ztest_shared;
+       spa_t *spa = zs->zs_spa;
+       vdev_t *rvd = spa->spa_root_vdev;
+       nvlist_t *tree, **child, *config, *split, **schild;
+       uint_t c, children, schildren = 0, lastlogid = 0;
+       int error = 0;
+
+       VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+
+       /* ensure we have a usable config; mirrors of raidz aren't supported */
+       if (zs->zs_mirrors < 3 || zopt_raidz > 1) {
+               VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+               return;
+       }
+
+       /* clean up the old pool, if any */
+       (void) spa_destroy("splitp");
+
+       spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+       /* generate a config from the existing config */
+       mutex_enter(&spa->spa_props_lock);
+       VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE,
+           &tree) == 0);
+       mutex_exit(&spa->spa_props_lock);
+
+       VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
+           &children) == 0);
+
+       schild = malloc(rvd->vdev_children * sizeof (nvlist_t *));
+       for (c = 0; c < children; c++) {
+               vdev_t *tvd = rvd->vdev_child[c];
+               nvlist_t **mchild;
+               uint_t mchildren;
+
+               if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) {
+                       VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME,
+                           0) == 0);
+                       VERIFY(nvlist_add_string(schild[schildren],
+                           ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0);
+                       VERIFY(nvlist_add_uint64(schild[schildren],
+                           ZPOOL_CONFIG_IS_HOLE, 1) == 0);
+                       if (lastlogid == 0)
+                               lastlogid = schildren;
+                       ++schildren;
+                       continue;
+               }
+               lastlogid = 0;
+               VERIFY(nvlist_lookup_nvlist_array(child[c],
+                   ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0);
+               VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0);
+       }
+
+       /* OK, create a config that can be used to split */
+       VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0);
+       VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE,
+           VDEV_TYPE_ROOT) == 0);
+       VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild,
+           lastlogid != 0 ? lastlogid : schildren) == 0);
+
+       VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
+       VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0);
+
+       for (c = 0; c < schildren; c++)
+               nvlist_free(schild[c]);
+       free(schild);
+       nvlist_free(split);
+
+       spa_config_exit(spa, SCL_VDEV, FTAG);
+
+       (void) rw_wrlock(&zs->zs_name_lock);
+       error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE);
+       (void) rw_unlock(&zs->zs_name_lock);
+
+       nvlist_free(config);
+
+       if (error == 0) {
+               (void) printf("successful split - results:\n");
+               mutex_enter(&spa_namespace_lock);
+               show_pool_stats(spa);
+               show_pool_stats(spa_lookup("splitp"));
+               mutex_exit(&spa_namespace_lock);
+               ++zs->zs_splits;
+               --zs->zs_mirrors;
+       }
+       VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+
 }
 
 /*
  * Verify that we can attach and detach devices.
  */
+/* ARGSUSED */
 void
-ztest_vdev_attach_detach(ztest_args_t *za)
+ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
 {
-       spa_t *spa = za->za_spa;
+       ztest_shared_t *zs = ztest_shared;
+       spa_t *spa = zs->zs_spa;
        spa_aux_vdev_t *sav = &spa->spa_spares;
        vdev_t *rvd = spa->spa_root_vdev;
        vdev_t *oldvd, *newvd, *pvd;
        nvlist_t *root;
-       uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
+       uint64_t leaves;
        uint64_t leaf, top;
        uint64_t ashift = ztest_get_ashift();
        uint64_t oldguid, pguid;
@@ -987,7 +2407,8 @@ ztest_vdev_attach_detach(ztest_args_t *za)
        int oldvd_is_log;
        int error, expected_error;
 
-       (void) mutex_lock(&ztest_shared->zs_vdev_lock);
+       VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+       leaves = MAX(zs->zs_mirrors, 1) * zopt_raidz;
 
        spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
@@ -999,7 +2420,7 @@ ztest_vdev_attach_detach(ztest_args_t *za)
        /*
         * Pick a random top-level vdev.
         */
-       top = ztest_random(rvd->vdev_children);
+       top = ztest_random_vdev_top(spa, B_TRUE);
 
        /*
         * Pick a random leaf within it.
@@ -1010,9 +2431,9 @@ ztest_vdev_attach_detach(ztest_args_t *za)
         * Locate this vdev.
         */
        oldvd = rvd->vdev_child[top];
-       if (zopt_mirrors >= 1) {
+       if (zs->zs_mirrors >= 1) {
                ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
-               ASSERT(oldvd->vdev_children >= zopt_mirrors);
+               ASSERT(oldvd->vdev_children >= zs->zs_mirrors);
                oldvd = oldvd->vdev_child[leaf / zopt_raidz];
        }
        if (zopt_raidz > 1) {
@@ -1047,7 +2468,7 @@ ztest_vdev_attach_detach(ztest_args_t *za)
                if (error != 0 && error != ENODEV && error != EBUSY &&
                    error != ENOTSUP)
                        fatal(0, "detach (%s) returned %d", oldpath, error);
-               (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+               VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
                return;
        }
 
@@ -1140,7 +2561,7 @@ ztest_vdev_attach_detach(ztest_args_t *za)
                    (longlong_t)newsize, replacing, error, expected_error);
        }
 
-       (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+       VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
 }
 
 /*
@@ -1180,27 +2601,48 @@ online_vdev(vdev_t *vd, void *arg)
 {
        spa_t *spa = vd->vdev_spa;
        vdev_t *tvd = vd->vdev_top;
-       vdev_t *pvd = vd->vdev_parent;
        uint64_t guid = vd->vdev_guid;
+       uint64_t generation = spa->spa_config_generation + 1;
+       vdev_state_t newstate = VDEV_STATE_UNKNOWN;
+       int error;
 
        ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
        ASSERT(vd->vdev_ops->vdev_op_leaf);
 
        /* Calling vdev_online will initialize the new metaslabs */
        spa_config_exit(spa, SCL_STATE, spa);
-       (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
+       error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate);
        spa_config_enter(spa, SCL_STATE, spa, RW_READER);
 
        /*
+        * If vdev_online returned an error or the underlying vdev_open
+        * failed then we abort the expand. The only way to know that
+        * vdev_open fails is by checking the returned newstate.
+        */
+       if (error || newstate != VDEV_STATE_HEALTHY) {
+               if (zopt_verbose >= 5) {
+                       (void) printf("Unable to expand vdev, state %llu, "
+                           "error %d\n", (u_longlong_t)newstate, error);
+               }
+               return (vd);
+       }
+       ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY);
+
+       /*
         * Since we dropped the lock we need to ensure that we're
         * still talking to the original vdev. It's possible this
         * vdev may have been detached/replaced while we were
         * trying to online it.
         */
-       if (vd != vdev_lookup_by_guid(tvd, guid) || vd->vdev_parent != pvd) {
-               if (zopt_verbose >= 6) {
-                       (void) printf("vdev %p has disappeared, was "
-                           "guid %llu\n", (void *)vd, (u_longlong_t)guid);
+       if (generation != spa->spa_config_generation) {
+               if (zopt_verbose >= 5) {
+                       (void) printf("vdev configuration has changed, "
+                           "guid %llu, state %llu, expected gen %llu, "
+                           "got gen %llu\n",
+                           (u_longlong_t)guid,
+                           (u_longlong_t)tvd->vdev_state,
+                           (u_longlong_t)generation,
+                           (u_longlong_t)spa->spa_config_generation);
                }
                return (vd);
        }
@@ -1235,24 +2677,29 @@ vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg)
 /*
  * Verify that dynamic LUN growth works as expected.
  */
+/* ARGSUSED */
 void
-ztest_vdev_LUN_growth(ztest_args_t *za)
+ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
 {
-       spa_t *spa = za->za_spa;
-       vdev_t *vd, *tvd = NULL;
+       ztest_shared_t *zs = ztest_shared;
+       spa_t *spa = zs->zs_spa;
+       vdev_t *vd, *tvd;
+       metaslab_class_t *mc;
+       metaslab_group_t *mg;
        size_t psize, newsize;
-       uint64_t spa_newsize, spa_cursize, ms_count;
+       uint64_t top;
+       uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count;
 
-       (void) mutex_lock(&ztest_shared->zs_vdev_lock);
-       mutex_enter(&spa_namespace_lock);
+       VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
        spa_config_enter(spa, SCL_STATE, spa, RW_READER);
 
-       while (tvd == NULL || tvd->vdev_islog) {
-               uint64_t vdev;
+       top = ztest_random_vdev_top(spa, B_TRUE);
 
-               vdev = ztest_random(spa->spa_root_vdev->vdev_children);
-               tvd = spa->spa_root_vdev->vdev_child[vdev];
-       }
+       tvd = spa->spa_root_vdev->vdev_child[top];
+       mg = tvd->vdev_mg;
+       mc = mg->mg_class;
+       old_ms_count = tvd->vdev_ms_count;
+       old_class_space = metaslab_class_get_space(mc);
 
        /*
         * Determine the size of the first leaf vdev associated with
@@ -1265,13 +2712,13 @@ ztest_vdev_LUN_growth(ztest_args_t *za)
        psize = vd->vdev_psize;
 
        /*
-        * We only try to expand the vdev if it's less than 4x its
-        * original size and it has a valid psize.
+        * We only try to expand the vdev if it's healthy, less than 4x its
+        * original size, and it has a valid psize.
         */
-       if (psize == 0 || psize >= 4 * zopt_vdev_size) {
+       if (tvd->vdev_state != VDEV_STATE_HEALTHY ||
+           psize == 0 || psize >= 4 * zopt_vdev_size) {
                spa_config_exit(spa, SCL_STATE, spa);
-               mutex_exit(&spa_namespace_lock);
-               (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+               VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
                return;
        }
        ASSERT(psize > 0);
@@ -1279,13 +2726,10 @@ ztest_vdev_LUN_growth(ztest_args_t *za)
        ASSERT3U(newsize, >, psize);
 
        if (zopt_verbose >= 6) {
-               (void) printf("Expanding vdev %s from %lu to %lu\n",
+               (void) printf("Expanding LUN %s from %lu to %lu\n",
                    vd->vdev_path, (ulong_t)psize, (ulong_t)newsize);
        }
 
-       spa_cursize = spa_get_space(spa);
-       ms_count = tvd->vdev_ms_count;
-
        /*
         * Growing the vdev is a two step process:
         *      1). expand the physical size (i.e. relabel)
@@ -1296,166 +2740,194 @@ ztest_vdev_LUN_growth(ztest_args_t *za)
            tvd->vdev_state != VDEV_STATE_HEALTHY) {
                if (zopt_verbose >= 5) {
                        (void) printf("Could not expand LUN because "
-                           "some vdevs were not healthy\n");
+                           "the vdev configuration changed.\n");
                }
-               (void) spa_config_exit(spa, SCL_STATE, spa);
-               mutex_exit(&spa_namespace_lock);
-               (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+               spa_config_exit(spa, SCL_STATE, spa);
+               VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
                return;
        }
 
-       (void) spa_config_exit(spa, SCL_STATE, spa);
-       mutex_exit(&spa_namespace_lock);
+       spa_config_exit(spa, SCL_STATE, spa);
 
        /*
         * Expanding the LUN will update the config asynchronously,
         * thus we must wait for the async thread to complete any
         * pending tasks before proceeding.
         */
-       mutex_enter(&spa->spa_async_lock);
-       while (spa->spa_async_thread != NULL || spa->spa_async_tasks)
-               cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
-       mutex_exit(&spa->spa_async_lock);
+       for (;;) {
+               boolean_t done;
+               mutex_enter(&spa->spa_async_lock);
+               done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks);
+               mutex_exit(&spa->spa_async_lock);
+               if (done)
+                       break;
+               txg_wait_synced(spa_get_dsl(spa), 0);
+               (void) poll(NULL, 0, 100);
+       }
 
        spa_config_enter(spa, SCL_STATE, spa, RW_READER);
-       spa_newsize = spa_get_space(spa);
+
+       tvd = spa->spa_root_vdev->vdev_child[top];
+       new_ms_count = tvd->vdev_ms_count;
+       new_class_space = metaslab_class_get_space(mc);
+
+       if (tvd->vdev_mg != mg || mg->mg_class != mc) {
+               if (zopt_verbose >= 5) {
+                       (void) printf("Could not verify LUN expansion due to "
+                           "intervening vdev offline or remove.\n");
+               }
+               spa_config_exit(spa, SCL_STATE, spa);
+               VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+               return;
+       }
+
+       /*
+        * Make sure we were able to grow the vdev.
+        */
+       if (new_ms_count <= old_ms_count)
+               fatal(0, "LUN expansion failed: ms_count %llu <= %llu\n",
+                   old_ms_count, new_ms_count);
 
        /*
         * Make sure we were able to grow the pool.
         */
-       if (ms_count >= tvd->vdev_ms_count ||
-           spa_cursize >= spa_newsize) {
-               (void) printf("Top-level vdev metaslab count: "
-                   "before %llu, after %llu\n",
-                   (u_longlong_t)ms_count,
-                   (u_longlong_t)tvd->vdev_ms_count);
-               fatal(0, "LUN expansion failed: before %llu, "
-                   "after %llu\n", spa_cursize, spa_newsize);
-       } else if (zopt_verbose >= 5) {
+       if (new_class_space <= old_class_space)
+               fatal(0, "LUN expansion failed: class_space %llu <= %llu\n",
+                   old_class_space, new_class_space);
+
+       if (zopt_verbose >= 5) {
                char oldnumbuf[6], newnumbuf[6];
 
-               nicenum(spa_cursize, oldnumbuf);
-               nicenum(spa_newsize, newnumbuf);
+               nicenum(old_class_space, oldnumbuf);
+               nicenum(new_class_space, newnumbuf);
                (void) printf("%s grew from %s to %s\n",
                    spa->spa_name, oldnumbuf, newnumbuf);
        }
+
        spa_config_exit(spa, SCL_STATE, spa);
-       (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+       VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
 }
 
+/*
+ * Verify that dmu_objset_{create,destroy,open,close} work as expected.
+ */
 /* ARGSUSED */
 static void
-ztest_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
+ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
 {
        /*
-        * Create the directory object.
+        * Create the objects common to all ztest datasets.
         */
-       VERIFY(dmu_object_claim(os, ZTEST_DIROBJ,
-           DMU_OT_UINT64_OTHER, ZTEST_DIROBJ_BLOCKSIZE,
-           DMU_OT_UINT64_OTHER, 5 * sizeof (ztest_block_tag_t), tx) == 0);
-
-       VERIFY(zap_create_claim(os, ZTEST_MICROZAP_OBJ,
+       VERIFY(zap_create_claim(os, ZTEST_DIROBJ,
            DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
+}
 
-       VERIFY(zap_create_claim(os, ZTEST_FATZAP_OBJ,
-           DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
+static int
+ztest_dataset_create(char *dsname)
+{
+       uint64_t zilset = ztest_random(100);
+       int err = dmu_objset_create(dsname, DMU_OST_OTHER, 0,
+           ztest_objset_create_cb, NULL);
+
+       if (err || zilset < 80)
+               return (err);
+
+       (void) printf("Setting dataset %s to sync always\n", dsname);
+       return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC,
+           ZFS_SYNC_ALWAYS, B_FALSE));
 }
 
+/* ARGSUSED */
 static int
-ztest_destroy_cb(char *name, void *arg)
+ztest_objset_destroy_cb(const char *name, void *arg)
 {
-       ztest_args_t *za = arg;
        objset_t *os;
-       dmu_object_info_t *doi = &za->za_doi;
+       dmu_object_info_t doi;
        int error;
 
        /*
         * Verify that the dataset contains a directory object.
         */
-       error = dmu_objset_open(name, DMU_OST_OTHER,
-           DS_MODE_USER | DS_MODE_READONLY, &os);
-       ASSERT3U(error, ==, 0);
-       error = dmu_object_info(os, ZTEST_DIROBJ, doi);
+       VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os));
+       error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
        if (error != ENOENT) {
                /* We could have crashed in the middle of destroying it */
                ASSERT3U(error, ==, 0);
-               ASSERT3U(doi->doi_type, ==, DMU_OT_UINT64_OTHER);
-               ASSERT3S(doi->doi_physical_blks, >=, 0);
+               ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER);
+               ASSERT3S(doi.doi_physical_blocks_512, >=, 0);
        }
-       dmu_objset_close(os);
+       dmu_objset_rele(os, FTAG);
 
        /*
         * Destroy the dataset.
         */
-       error = dmu_objset_destroy(name, B_FALSE);
-       if (error) {
-               (void) dmu_objset_open(name, DMU_OST_OTHER,
-                   DS_MODE_USER | DS_MODE_READONLY, &os);
-               fatal(0, "dmu_objset_destroy(os=%p) = %d\n", &os, error);
-       }
+       VERIFY3U(0, ==, dmu_objset_destroy(name, B_FALSE));
        return (0);
 }
 
-/*
- * Verify that dmu_objset_{create,destroy,open,close} work as expected.
- */
-static uint64_t
-ztest_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t object, int mode)
+static boolean_t
+ztest_snapshot_create(char *osname, uint64_t id)
 {
-       itx_t *itx;
-       lr_create_t *lr;
-       size_t namesize;
-       char name[24];
-
-       (void) sprintf(name, "ZOBJ_%llu", (u_longlong_t)object);
-       namesize = strlen(name) + 1;
-
-       itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize +
-           ztest_random(ZIL_MAX_BLKSZ));
-       lr = (lr_create_t *)&itx->itx_lr;
-       bzero(lr + 1, lr->lr_common.lrc_reclen - sizeof (*lr));
-       lr->lr_doid = object;
-       lr->lr_foid = 0;
-       lr->lr_mode = mode;
-       lr->lr_uid = 0;
-       lr->lr_gid = 0;
-       lr->lr_gen = dmu_tx_get_txg(tx);
-       lr->lr_crtime[0] = time(NULL);
-       lr->lr_crtime[1] = 0;
-       lr->lr_rdev = 0;
-       bcopy(name, (char *)(lr + 1), namesize);
-
-       return (zil_itx_assign(zilog, itx, tx));
+       char snapname[MAXNAMELEN];
+       int error;
+
+       (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
+           (u_longlong_t)id);
+
+       error = dmu_objset_snapshot(osname, strchr(snapname, '@') + 1,
+           NULL, B_FALSE);
+       if (error == ENOSPC) {
+               ztest_record_enospc(FTAG);
+               return (B_FALSE);
+       }
+       if (error != 0 && error != EEXIST)
+               fatal(0, "ztest_snapshot_create(%s) = %d", snapname, error);
+       return (B_TRUE);
+}
+
+static boolean_t
+ztest_snapshot_destroy(char *osname, uint64_t id)
+{
+       char snapname[MAXNAMELEN];
+       int error;
+
+       (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
+           (u_longlong_t)id);
+
+       error = dmu_objset_destroy(snapname, B_FALSE);
+       if (error != 0 && error != ENOENT)
+               fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error);
+       return (B_TRUE);
 }
 
+/* ARGSUSED */
 void
-ztest_dmu_objset_create_destroy(ztest_args_t *za)
+ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
 {
+       ztest_shared_t *zs = ztest_shared;
+       ztest_ds_t zdtmp;
+       int iters;
        int error;
        objset_t *os, *os2;
-       char name[100];
-       int basemode, expected_error;
+       char name[MAXNAMELEN];
        zilog_t *zilog;
-       uint64_t seq;
-       uint64_t objects;
 
-       (void) rw_rdlock(&ztest_shared->zs_name_lock);
-       (void) snprintf(name, 100, "%s/%s_temp_%llu", za->za_pool, za->za_pool,
-           (u_longlong_t)za->za_instance);
+       (void) rw_rdlock(&zs->zs_name_lock);
 
-       basemode = DS_MODE_TYPE(za->za_instance);
-       if (basemode != DS_MODE_USER && basemode != DS_MODE_OWNER)
-               basemode = DS_MODE_USER;
+       (void) snprintf(name, MAXNAMELEN, "%s/temp_%llu",
+           zs->zs_pool, (u_longlong_t)id);
 
        /*
         * If this dataset exists from a previous run, process its replay log
         * half of the time.  If we don't replay it, then dmu_objset_destroy()
-        * (invoked from ztest_destroy_cb() below) should just throw it away.
+        * (invoked from ztest_objset_destroy_cb()) should just throw it away.
         */
        if (ztest_random(2) == 0 &&
-           dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_OWNER, &os) == 0) {
-               zil_replay(os, os, ztest_replay_vector);
-               dmu_objset_close(os);
+           dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) {
+               ztest_zd_init(&zdtmp, os);
+               zil_replay(os, &zdtmp, ztest_replay_vector);
+               ztest_zd_fini(&zdtmp);
+               dmu_objset_disown(os, FTAG);
        }
 
        /*
@@ -1463,156 +2935,105 @@ ztest_dmu_objset_create_destroy(ztest_args_t *za)
         * create lying around from a previous run.  If so, destroy it
         * and all of its snapshots.
         */
-       (void) dmu_objset_find(name, ztest_destroy_cb, za,
+       (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
            DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
 
        /*
         * Verify that the destroyed dataset is no longer in the namespace.
         */
-       error = dmu_objset_open(name, DMU_OST_OTHER, basemode, &os);
-       if (error != ENOENT)
-               fatal(1, "dmu_objset_open(%s) found destroyed dataset %p",
-                   name, os);
+       VERIFY3U(ENOENT, ==, dmu_objset_hold(name, FTAG, &os));
 
        /*
         * Verify that we can create a new dataset.
         */
-       error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0,
-           ztest_create_cb, NULL);
+       error = ztest_dataset_create(name);
        if (error) {
                if (error == ENOSPC) {
-                       ztest_record_enospc("dmu_objset_create");
-                       (void) rw_unlock(&ztest_shared->zs_name_lock);
+                       ztest_record_enospc(FTAG);
+                       (void) rw_unlock(&zs->zs_name_lock);
                        return;
                }
                fatal(0, "dmu_objset_create(%s) = %d", name, error);
        }
 
-       error = dmu_objset_open(name, DMU_OST_OTHER, basemode, &os);
-       if (error) {
-               fatal(0, "dmu_objset_open(%s) = %d", name, error);
-       }
+       VERIFY3U(0, ==,
+           dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os));
+
+       ztest_zd_init(&zdtmp, os);
 
        /*
         * Open the intent log for it.
         */
-       zilog = zil_open(os, NULL);
+       zilog = zil_open(os, ztest_get_data);
 
        /*
-        * Put a random number of objects in there.
+        * Put some objects in there, do a little I/O to them,
+        * and randomly take a couple of snapshots along the way.
         */
-       objects = ztest_random(20);
-       seq = 0;
-       while (objects-- != 0) {
-               uint64_t object;
-               dmu_tx_t *tx = dmu_tx_create(os);
-               dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, sizeof (name));
-               error = dmu_tx_assign(tx, TXG_WAIT);
-               if (error) {
-                       dmu_tx_abort(tx);
-               } else {
-                       object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
-                           DMU_OT_NONE, 0, tx);
-                       ztest_set_random_blocksize(os, object, tx);
-                       seq = ztest_log_create(zilog, tx, object,
-                           DMU_OT_UINT64_OTHER);
-                       dmu_write(os, object, 0, sizeof (name), name, tx);
-                       dmu_tx_commit(tx);
-               }
-               if (ztest_random(5) == 0) {
-                       zil_commit(zilog, seq, object);
-               }
-               if (ztest_random(100) == 0) {
-                       error = zil_suspend(zilog);
-                       if (error == 0) {
-                               zil_resume(zilog);
-                       }
-               }
+       iters = ztest_random(5);
+       for (int i = 0; i < iters; i++) {
+               ztest_dmu_object_alloc_free(&zdtmp, id);
+               if (ztest_random(iters) == 0)
+                       (void) ztest_snapshot_create(name, i);
        }
 
        /*
         * Verify that we cannot create an existing dataset.
         */
-       error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0, NULL, NULL);
-       if (error != EEXIST)
-               fatal(0, "created existing dataset, error = %d", error);
+       VERIFY3U(EEXIST, ==,
+           dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL));
 
        /*
-        * Verify that multiple dataset holds are allowed, but only when
-        * the new access mode is compatible with the base mode.
+        * Verify that we can hold an objset that is also owned.
         */
-       if (basemode == DS_MODE_OWNER) {
-               error = dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_USER,
-                   &os2);
-               if (error)
-                       fatal(0, "dmu_objset_open('%s') = %d", name, error);
-               else
-                       dmu_objset_close(os2);
-       }
-       error = dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_OWNER, &os2);
-       expected_error = (basemode == DS_MODE_OWNER) ? EBUSY : 0;
-       if (error != expected_error)
-               fatal(0, "dmu_objset_open('%s') = %d, expected %d",
-                   name, error, expected_error);
-       if (error == 0)
-               dmu_objset_close(os2);
+       VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2));
+       dmu_objset_rele(os2, FTAG);
 
-       zil_close(zilog);
-       dmu_objset_close(os);
+       /*
+        * Verify that we cannot own an objset that is already owned.
+        */
+       VERIFY3U(EBUSY, ==,
+           dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2));
 
-       error = dmu_objset_destroy(name, B_FALSE);
-       if (error)
-               fatal(0, "dmu_objset_destroy(%s) = %d", name, error);
+       zil_close(zilog);
+       dmu_objset_disown(os, FTAG);
+       ztest_zd_fini(&zdtmp);
 
-       (void) rw_unlock(&ztest_shared->zs_name_lock);
+       (void) rw_unlock(&zs->zs_name_lock);
 }
 
 /*
  * Verify that dmu_snapshot_{create,destroy,open,close} work as expected.
  */
 void
-ztest_dmu_snapshot_create_destroy(ztest_args_t *za)
+ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id)
 {
-       int error;
-       objset_t *os = za->za_os;
-       char snapname[100];
-       char osname[MAXNAMELEN];
-
-       (void) rw_rdlock(&ztest_shared->zs_name_lock);
-       dmu_objset_name(os, osname);
-       (void) snprintf(snapname, 100, "%s@%llu", osname,
-           (u_longlong_t)za->za_instance);
+       ztest_shared_t *zs = ztest_shared;
 
-       error = dmu_objset_destroy(snapname, B_FALSE);
-       if (error != 0 && error != ENOENT)
-               fatal(0, "dmu_objset_destroy() = %d", error);
-       error = dmu_objset_snapshot(osname, strchr(snapname, '@')+1,
-           NULL, FALSE);
-       if (error == ENOSPC)
-               ztest_record_enospc("dmu_take_snapshot");
-       else if (error != 0 && error != EEXIST)
-               fatal(0, "dmu_take_snapshot() = %d", error);
-       (void) rw_unlock(&ztest_shared->zs_name_lock);
+       (void) rw_rdlock(&zs->zs_name_lock);
+       (void) ztest_snapshot_destroy(zd->zd_name, id);
+       (void) ztest_snapshot_create(zd->zd_name, id);
+       (void) rw_unlock(&zs->zs_name_lock);
 }
 
 /*
  * Cleanup non-standard snapshots and clones.
  */
 void
-ztest_dsl_dataset_cleanup(char *osname, uint64_t curval)
+ztest_dsl_dataset_cleanup(char *osname, uint64_t id)
 {
-       char snap1name[100];
-       char clone1name[100];
-       char snap2name[100];
-       char clone2name[100];
-       char snap3name[100];
+       char snap1name[MAXNAMELEN];
+       char clone1name[MAXNAMELEN];
+       char snap2name[MAXNAMELEN];
+       char clone2name[MAXNAMELEN];
+       char snap3name[MAXNAMELEN];
        int error;
 
-       (void) snprintf(snap1name, 100, "%s@s1_%llu", osname, curval);
-       (void) snprintf(clone1name, 100, "%s/c1_%llu", osname, curval);
-       (void) snprintf(snap2name, 100, "%s@s2_%llu", clone1name, curval);
-       (void) snprintf(clone2name, 100, "%s/c2_%llu", osname, curval);
-       (void) snprintf(snap3name, 100, "%s@s3_%llu", clone1name, curval);
+       (void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname, id);
+       (void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname, id);
+       (void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name, id);
+       (void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, id);
+       (void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id);
 
        error = dmu_objset_destroy(clone2name, B_FALSE);
        if (error && error != ENOENT)
@@ -1635,343 +3056,140 @@ ztest_dsl_dataset_cleanup(char *osname, uint64_t curval)
  * Verify dsl_dataset_promote handles EBUSY
  */
 void
-ztest_dsl_dataset_promote_busy(ztest_args_t *za)
+ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
 {
-       int error;
-       objset_t *os = za->za_os;
+       ztest_shared_t *zs = ztest_shared;
        objset_t *clone;
        dsl_dataset_t *ds;
-       char snap1name[100];
-       char clone1name[100];
-       char snap2name[100];
-       char clone2name[100];
-       char snap3name[100];
-       char osname[MAXNAMELEN];
-       uint64_t curval = za->za_instance;
+       char snap1name[MAXNAMELEN];
+       char clone1name[MAXNAMELEN];
+       char snap2name[MAXNAMELEN];
+       char clone2name[MAXNAMELEN];
+       char snap3name[MAXNAMELEN];
+       char *osname = zd->zd_name;
+       int error;
 
-       (void) rw_rdlock(&ztest_shared->zs_name_lock);
+       (void) rw_rdlock(&zs->zs_name_lock);
 
-       dmu_objset_name(os, osname);
-       ztest_dsl_dataset_cleanup(osname, curval);
+       ztest_dsl_dataset_cleanup(osname, id);
 
-       (void) snprintf(snap1name, 100, "%s@s1_%llu", osname, curval);
-       (void) snprintf(clone1name, 100, "%s/c1_%llu", osname, curval);
-       (void) snprintf(snap2name, 100, "%s@s2_%llu", clone1name, curval);
-       (void) snprintf(clone2name, 100, "%s/c2_%llu", osname, curval);
-       (void) snprintf(snap3name, 100, "%s@s3_%llu", clone1name, curval);
+       (void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname, id);
+       (void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname, id);
+       (void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name, id);
+       (void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, id);
+       (void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id);
 
        error = dmu_objset_snapshot(osname, strchr(snap1name, '@')+1,
-           NULL, FALSE);
+           NULL, B_FALSE);
        if (error && error != EEXIST) {
                if (error == ENOSPC) {
-                       ztest_record_enospc("dmu_take_snapshot");
+                       ztest_record_enospc(FTAG);
                        goto out;
                }
                fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error);
        }
 
-       error = dmu_objset_open(snap1name, DMU_OST_OTHER,
-           DS_MODE_USER | DS_MODE_READONLY, &clone);
+       error = dmu_objset_hold(snap1name, FTAG, &clone);
        if (error)
                fatal(0, "dmu_open_snapshot(%s) = %d", snap1name, error);
 
-       error = dmu_objset_create(clone1name, DMU_OST_OTHER, clone, 0,
-           NULL, NULL);
-       dmu_objset_close(clone);
+       error = dmu_objset_clone(clone1name, dmu_objset_ds(clone), 0);
+       dmu_objset_rele(clone, FTAG);
        if (error) {
                if (error == ENOSPC) {
-                       ztest_record_enospc("dmu_objset_create");
+                       ztest_record_enospc(FTAG);
                        goto out;
                }
                fatal(0, "dmu_objset_create(%s) = %d", clone1name, error);
        }
 
-       error = dmu_objset_snapshot(clone1name, strchr(snap2name, '@')+1,
-           NULL, FALSE);
-       if (error && error != EEXIST) {
-               if (error == ENOSPC) {
-                       ztest_record_enospc("dmu_take_snapshot");
-                       goto out;
-               }
-               fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error);
-       }
-
-       error = dmu_objset_snapshot(clone1name, strchr(snap3name, '@')+1,
-           NULL, FALSE);
-       if (error && error != EEXIST) {
-               if (error == ENOSPC) {
-                       ztest_record_enospc("dmu_take_snapshot");
-                       goto out;
-               }
-               fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
-       }
-
-       error = dmu_objset_open(snap3name, DMU_OST_OTHER,
-           DS_MODE_USER | DS_MODE_READONLY, &clone);
-       if (error)
-               fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
-
-       error = dmu_objset_create(clone2name, DMU_OST_OTHER, clone, 0,
-           NULL, NULL);
-       dmu_objset_close(clone);
-       if (error) {
-               if (error == ENOSPC) {
-                       ztest_record_enospc("dmu_objset_create");
-                       goto out;
-               }
-               fatal(0, "dmu_objset_create(%s) = %d", clone2name, error);
-       }
-
-       error = dsl_dataset_own(snap1name, DS_MODE_READONLY, FTAG, &ds);
-       if (error)
-               fatal(0, "dsl_dataset_own(%s) = %d", snap1name, error);
-       error = dsl_dataset_promote(clone2name);
-       if (error != EBUSY)
-               fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
-                   error);
-       dsl_dataset_disown(ds, FTAG);
-
-out:
-       ztest_dsl_dataset_cleanup(osname, curval);
-
-       (void) rw_unlock(&ztest_shared->zs_name_lock);
-}
-
-/*
- * Verify that dmu_object_{alloc,free} work as expected.
- */
-void
-ztest_dmu_object_alloc_free(ztest_args_t *za)
-{
-       objset_t *os = za->za_os;
-       dmu_buf_t *db;
-       dmu_tx_t *tx;
-       uint64_t batchobj, object, batchsize, endoff, temp;
-       int b, c, error, bonuslen;
-       dmu_object_info_t *doi = &za->za_doi;
-       char osname[MAXNAMELEN];
-
-       dmu_objset_name(os, osname);
-
-       endoff = -8ULL;
-       batchsize = 2;
-
-       /*
-        * Create a batch object if necessary, and record it in the directory.
-        */
-       VERIFY3U(0, ==, dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
-           sizeof (uint64_t), &batchobj, DMU_READ_PREFETCH));
-       if (batchobj == 0) {
-               tx = dmu_tx_create(os);
-               dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
-                   sizeof (uint64_t));
-               dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-               error = dmu_tx_assign(tx, TXG_WAIT);
-               if (error) {
-                       ztest_record_enospc("create a batch object");
-                       dmu_tx_abort(tx);
-                       return;
-               }
-               batchobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
-                   DMU_OT_NONE, 0, tx);
-               ztest_set_random_blocksize(os, batchobj, tx);
-               dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
-                   sizeof (uint64_t), &batchobj, tx);
-               dmu_tx_commit(tx);
-       }
-
-       /*
-        * Destroy the previous batch of objects.
-        */
-       for (b = 0; b < batchsize; b++) {
-               VERIFY3U(0, ==, dmu_read(os, batchobj, b * sizeof (uint64_t),
-                   sizeof (uint64_t), &object, DMU_READ_PREFETCH));
-               if (object == 0)
-                       continue;
-               /*
-                * Read and validate contents.
-                * We expect the nth byte of the bonus buffer to be n.
-                */
-               VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
-               za->za_dbuf = db;
-
-               dmu_object_info_from_db(db, doi);
-               ASSERT(doi->doi_type == DMU_OT_UINT64_OTHER);
-               ASSERT(doi->doi_bonus_type == DMU_OT_PLAIN_OTHER);
-               ASSERT3S(doi->doi_physical_blks, >=, 0);
-
-               bonuslen = doi->doi_bonus_size;
-
-               for (c = 0; c < bonuslen; c++) {
-                       if (((uint8_t *)db->db_data)[c] !=
-                           (uint8_t)(c + bonuslen)) {
-                               fatal(0,
-                                   "bad bonus: %s, obj %llu, off %d: %u != %u",
-                                   osname, object, c,
-                                   ((uint8_t *)db->db_data)[c],
-                                   (uint8_t)(c + bonuslen));
-                       }
-               }
-
-               dmu_buf_rele(db, FTAG);
-               za->za_dbuf = NULL;
-
-               /*
-                * We expect the word at endoff to be our object number.
-                */
-               VERIFY(0 == dmu_read(os, object, endoff,
-                   sizeof (uint64_t), &temp, DMU_READ_PREFETCH));
-
-               if (temp != object) {
-                       fatal(0, "bad data in %s, got %llu, expected %llu",
-                           osname, temp, object);
-               }
-
-               /*
-                * Destroy old object and clear batch entry.
-                */
-               tx = dmu_tx_create(os);
-               dmu_tx_hold_write(tx, batchobj,
-                   b * sizeof (uint64_t), sizeof (uint64_t));
-               dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
-               error = dmu_tx_assign(tx, TXG_WAIT);
-               if (error) {
-                       ztest_record_enospc("free object");
-                       dmu_tx_abort(tx);
-                       return;
-               }
-               error = dmu_object_free(os, object, tx);
-               if (error) {
-                       fatal(0, "dmu_object_free('%s', %llu) = %d",
-                           osname, object, error);
-               }
-               object = 0;
-
-               dmu_object_set_checksum(os, batchobj,
-                   ztest_random_checksum(), tx);
-               dmu_object_set_compress(os, batchobj,
-                   ztest_random_compress(), tx);
-
-               dmu_write(os, batchobj, b * sizeof (uint64_t),
-                   sizeof (uint64_t), &object, tx);
-
-               dmu_tx_commit(tx);
-       }
-
-       /*
-        * Before creating the new batch of objects, generate a bunch of churn.
-        */
-       for (b = ztest_random(100); b > 0; b--) {
-               tx = dmu_tx_create(os);
-               dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-               error = dmu_tx_assign(tx, TXG_WAIT);
-               if (error) {
-                       ztest_record_enospc("churn objects");
-                       dmu_tx_abort(tx);
-                       return;
-               }
-               object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
-                   DMU_OT_NONE, 0, tx);
-               ztest_set_random_blocksize(os, object, tx);
-               error = dmu_object_free(os, object, tx);
-               if (error) {
-                       fatal(0, "dmu_object_free('%s', %llu) = %d",
-                           osname, object, error);
-               }
-               dmu_tx_commit(tx);
-       }
-
-       /*
-        * Create a new batch of objects with randomly chosen
-        * blocksizes and record them in the batch directory.
-        */
-       for (b = 0; b < batchsize; b++) {
-               uint32_t va_blksize;
-               u_longlong_t va_nblocks;
-
-               tx = dmu_tx_create(os);
-               dmu_tx_hold_write(tx, batchobj, b * sizeof (uint64_t),
-                   sizeof (uint64_t));
-               dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-               dmu_tx_hold_write(tx, DMU_NEW_OBJECT, endoff,
-                   sizeof (uint64_t));
-               error = dmu_tx_assign(tx, TXG_WAIT);
-               if (error) {
-                       ztest_record_enospc("create batchobj");
-                       dmu_tx_abort(tx);
-                       return;
+       error = dmu_objset_snapshot(clone1name, strchr(snap2name, '@')+1,
+           NULL, B_FALSE);
+       if (error && error != EEXIST) {
+               if (error == ENOSPC) {
+                       ztest_record_enospc(FTAG);
+                       goto out;
                }
-               bonuslen = (int)ztest_random(dmu_bonus_max()) + 1;
-
-               object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
-                   DMU_OT_PLAIN_OTHER, bonuslen, tx);
+               fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error);
+       }
 
-               ztest_set_random_blocksize(os, object, tx);
+       error = dmu_objset_snapshot(clone1name, strchr(snap3name, '@')+1,
+           NULL, B_FALSE);
+       if (error && error != EEXIST) {
+               if (error == ENOSPC) {
+                       ztest_record_enospc(FTAG);
+                       goto out;
+               }
+               fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
+       }
 
-               dmu_object_set_checksum(os, object,
-                   ztest_random_checksum(), tx);
-               dmu_object_set_compress(os, object,
-                   ztest_random_compress(), tx);
+       error = dmu_objset_hold(snap3name, FTAG, &clone);
+       if (error)
+               fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
 
-               dmu_write(os, batchobj, b * sizeof (uint64_t),
-                   sizeof (uint64_t), &object, tx);
+       error = dmu_objset_clone(clone2name, dmu_objset_ds(clone), 0);
+       dmu_objset_rele(clone, FTAG);
+       if (error) {
+               if (error == ENOSPC) {
+                       ztest_record_enospc(FTAG);
+                       goto out;
+               }
+               fatal(0, "dmu_objset_create(%s) = %d", clone2name, error);
+       }
 
-               /*
-                * Write to both the bonus buffer and the regular data.
-                */
-               VERIFY(dmu_bonus_hold(os, object, FTAG, &db) == 0);
-               za->za_dbuf = db;
-               ASSERT3U(bonuslen, <=, db->db_size);
+       error = dsl_dataset_own(snap2name, B_FALSE, FTAG, &ds);
+       if (error)
+               fatal(0, "dsl_dataset_own(%s) = %d", snap2name, error);
+       error = dsl_dataset_promote(clone2name, NULL);
+       if (error != EBUSY)
+               fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
+                   error);
+       dsl_dataset_disown(ds, FTAG);
 
-               dmu_object_size_from_db(db, &va_blksize, &va_nblocks);
-               ASSERT3S(va_nblocks, >=, 0);
+out:
+       ztest_dsl_dataset_cleanup(osname, id);
 
-               dmu_buf_will_dirty(db, tx);
+       (void) rw_unlock(&zs->zs_name_lock);
+}
 
-               /*
-                * See comments above regarding the contents of
-                * the bonus buffer and the word at endoff.
-                */
-               for (c = 0; c < bonuslen; c++)
-                       ((uint8_t *)db->db_data)[c] = (uint8_t)(c + bonuslen);
+/*
+ * Verify that dmu_object_{alloc,free} work as expected.
+ */
+void
+ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
+{
+       ztest_od_t od[4];
+       int batchsize = sizeof (od) / sizeof (od[0]);
 
-               dmu_buf_rele(db, FTAG);
-               za->za_dbuf = NULL;
+       for (int b = 0; b < batchsize; b++)
+               ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0);
 
-               /*
-                * Write to a large offset to increase indirection.
-                */
-               dmu_write(os, object, endoff, sizeof (uint64_t), &object, tx);
+       /*
+        * Destroy the previous batch of objects, create a new batch,
+        * and do some I/O on the new objects.
+        */
+       if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0)
+               return;
 
-               dmu_tx_commit(tx);
-       }
+       while (ztest_random(4 * batchsize) != 0)
+               ztest_io(zd, od[ztest_random(batchsize)].od_object,
+                   ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
 }
 
 /*
  * Verify that dmu_{read,write} work as expected.
  */
-typedef struct bufwad {
-       uint64_t        bw_index;
-       uint64_t        bw_txg;
-       uint64_t        bw_data;
-} bufwad_t;
-
-typedef struct dmu_read_write_dir {
-       uint64_t        dd_packobj;
-       uint64_t        dd_bigobj;
-       uint64_t        dd_chunk;
-} dmu_read_write_dir_t;
-
 void
-ztest_dmu_read_write(ztest_args_t *za)
+ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
 {
-       objset_t *os = za->za_os;
-       dmu_read_write_dir_t dd;
+       objset_t *os = zd->zd_os;
+       ztest_od_t od[2];
        dmu_tx_t *tx;
        int i, freeit, error;
        uint64_t n, s, txg;
        bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT;
-       uint64_t packoff, packsize, bigoff, bigsize;
+       uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
+       uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t);
        uint64_t regions = 997;
        uint64_t stride = 123456789ULL;
        uint64_t width = 40;
@@ -2004,34 +3222,16 @@ ztest_dmu_read_write(ztest_args_t *za)
        /*
         * Read the directory info.  If it's the first time, set things up.
         */
-       VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
-           sizeof (dd), &dd, DMU_READ_PREFETCH));
-       if (dd.dd_chunk == 0) {
-               ASSERT(dd.dd_packobj == 0);
-               ASSERT(dd.dd_bigobj == 0);
-               tx = dmu_tx_create(os);
-               dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (dd));
-               dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-               error = dmu_tx_assign(tx, TXG_WAIT);
-               if (error) {
-                       ztest_record_enospc("create r/w directory");
-                       dmu_tx_abort(tx);
-                       return;
-               }
-
-               dd.dd_packobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
-                   DMU_OT_NONE, 0, tx);
-               dd.dd_bigobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
-                   DMU_OT_NONE, 0, tx);
-               dd.dd_chunk = (1000 + ztest_random(1000)) * sizeof (uint64_t);
+       ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, chunksize);
+       ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);
 
-               ztest_set_random_blocksize(os, dd.dd_packobj, tx);
-               ztest_set_random_blocksize(os, dd.dd_bigobj, tx);
+       if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+               return;
 
-               dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (dd), &dd,
-                   tx);
-               dmu_tx_commit(tx);
-       }
+       bigobj = od[0].od_object;
+       packobj = od[1].od_object;
+       chunksize = od[0].od_gen;
+       ASSERT(chunksize == od[1].od_gen);
 
        /*
         * Prefetch a random chunk of the big object.
@@ -2041,7 +3241,7 @@ ztest_dmu_read_write(ztest_args_t *za)
         */
        n = ztest_random(regions) * stride + ztest_random(width);
        s = 1 + ztest_random(2 * width - 1);
-       dmu_prefetch(os, dd.dd_bigobj, n * dd.dd_chunk, s * dd.dd_chunk);
+       dmu_prefetch(os, bigobj, n * chunksize, s * chunksize);
 
        /*
         * Pick a random index and compute the offsets into packobj and bigobj.
@@ -2052,8 +3252,8 @@ ztest_dmu_read_write(ztest_args_t *za)
        packoff = n * sizeof (bufwad_t);
        packsize = s * sizeof (bufwad_t);
 
-       bigoff = n * dd.dd_chunk;
-       bigsize = s * dd.dd_chunk;
+       bigoff = n * chunksize;
+       bigsize = s * chunksize;
 
        packbuf = umem_alloc(packsize, UMEM_NOFAIL);
        bigbuf = umem_alloc(bigsize, UMEM_NOFAIL);
@@ -2067,10 +3267,10 @@ ztest_dmu_read_write(ztest_args_t *za)
        /*
         * Read the current contents of our objects.
         */
-       error = dmu_read(os, dd.dd_packobj, packoff, packsize, packbuf,
+       error = dmu_read(os, packobj, packoff, packsize, packbuf,
            DMU_READ_PREFETCH);
        ASSERT3U(error, ==, 0);
-       error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize, bigbuf,
+       error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf,
            DMU_READ_PREFETCH);
        ASSERT3U(error, ==, 0);
 
@@ -2079,24 +3279,25 @@ ztest_dmu_read_write(ztest_args_t *za)
         */
        tx = dmu_tx_create(os);
 
-       dmu_tx_hold_write(tx, dd.dd_packobj, packoff, packsize);
+       dmu_tx_hold_write(tx, packobj, packoff, packsize);
 
        if (freeit)
-               dmu_tx_hold_free(tx, dd.dd_bigobj, bigoff, bigsize);
+               dmu_tx_hold_free(tx, bigobj, bigoff, bigsize);
        else
-               dmu_tx_hold_write(tx, dd.dd_bigobj, bigoff, bigsize);
-
-       error = dmu_tx_assign(tx, TXG_WAIT);
+               dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
 
-       if (error) {
-               ztest_record_enospc("dmu r/w range");
-               dmu_tx_abort(tx);
+       txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+       if (txg == 0) {
                umem_free(packbuf, packsize);
                umem_free(bigbuf, bigsize);
                return;
        }
 
-       txg = dmu_tx_get_txg(tx);
+       dmu_object_set_checksum(os, bigobj,
+           (enum zio_checksum)ztest_random_dsl_prop(ZFS_PROP_CHECKSUM), tx);
+
+       dmu_object_set_compress(os, bigobj,
+           (enum zio_compress)ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), tx);
 
        /*
         * For each index from n to n + s, verify that the existing bufwad
@@ -2108,9 +3309,9 @@ ztest_dmu_read_write(ztest_args_t *za)
                /* LINTED */
                pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
                /* LINTED */
-               bigH = (bufwad_t *)((char *)bigbuf + i * dd.dd_chunk);
+               bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
                /* LINTED */
-               bigT = (bufwad_t *)((char *)bigH + dd.dd_chunk) - 1;
+               bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
 
                ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
                ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
@@ -2144,27 +3345,26 @@ ztest_dmu_read_write(ztest_args_t *za)
         * We've verified all the old bufwads, and made new ones.
         * Now write them out.
         */
-       dmu_write(os, dd.dd_packobj, packoff, packsize, packbuf, tx);
+       dmu_write(os, packobj, packoff, packsize, packbuf, tx);
 
        if (freeit) {
-               if (zopt_verbose >= 6) {
+               if (zopt_verbose >= 7) {
                        (void) printf("freeing offset %llx size %llx"
                            " txg %llx\n",
                            (u_longlong_t)bigoff,
                            (u_longlong_t)bigsize,
                            (u_longlong_t)txg);
                }
-               VERIFY(0 == dmu_free_range(os, dd.dd_bigobj, bigoff,
-                   bigsize, tx));
+               VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx));
        } else {
-               if (zopt_verbose >= 6) {
+               if (zopt_verbose >= 7) {
                        (void) printf("writing offset %llx size %llx"
                            " txg %llx\n",
                            (u_longlong_t)bigoff,
                            (u_longlong_t)bigsize,
                            (u_longlong_t)txg);
                }
-               dmu_write(os, dd.dd_bigobj, bigoff, bigsize, bigbuf, tx);
+               dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx);
        }
 
        dmu_tx_commit(tx);
@@ -2176,9 +3376,9 @@ ztest_dmu_read_write(ztest_args_t *za)
                void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
                void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
 
-               VERIFY(0 == dmu_read(os, dd.dd_packobj, packoff,
+               VERIFY(0 == dmu_read(os, packobj, packoff,
                    packsize, packcheck, DMU_READ_PREFETCH));
-               VERIFY(0 == dmu_read(os, dd.dd_bigobj, bigoff,
+               VERIFY(0 == dmu_read(os, bigobj, bigoff,
                    bigsize, bigcheck, DMU_READ_PREFETCH));
 
                ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
@@ -2194,7 +3394,7 @@ ztest_dmu_read_write(ztest_args_t *za)
 
 void
 compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf,
-    uint64_t bigsize, uint64_t n, dmu_read_write_dir_t dd, uint64_t txg)
+    uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg)
 {
        uint64_t i;
        bufwad_t *pack;
@@ -2211,9 +3411,9 @@ compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf,
                /* LINTED */
                pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
                /* LINTED */
-               bigH = (bufwad_t *)((char *)bigbuf + i * dd.dd_chunk);
+               bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
                /* LINTED */
-               bigT = (bufwad_t *)((char *)bigH + dd.dd_chunk) - 1;
+               bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
 
                ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
                ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
@@ -2242,22 +3442,24 @@ compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf,
 }
 
 void
-ztest_dmu_read_write_zcopy(ztest_args_t *za)
+ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
 {
-       objset_t *os = za->za_os;
-       dmu_read_write_dir_t dd;
+       objset_t *os = zd->zd_os;
+       ztest_od_t od[2];
        dmu_tx_t *tx;
        uint64_t i;
        int error;
        uint64_t n, s, txg;
        bufwad_t *packbuf, *bigbuf;
-       uint64_t packoff, packsize, bigoff, bigsize;
+       uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
+       uint64_t blocksize = ztest_random_blocksize();
+       uint64_t chunksize = blocksize;
        uint64_t regions = 997;
        uint64_t stride = 123456789ULL;
        uint64_t width = 9;
        dmu_buf_t *bonus_db;
        arc_buf_t **bigbuf_arcbufs;
-       dmu_object_info_t *doi = &za->za_doi;
+       dmu_object_info_t doi;
 
        /*
         * This test uses two objects, packobj and bigobj, that are always
@@ -2278,42 +3480,22 @@ ztest_dmu_read_write_zcopy(ztest_args_t *za)
        /*
         * Read the directory info.  If it's the first time, set things up.
         */
-       VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
-           sizeof (dd), &dd, DMU_READ_PREFETCH));
-       if (dd.dd_chunk == 0) {
-               ASSERT(dd.dd_packobj == 0);
-               ASSERT(dd.dd_bigobj == 0);
-               tx = dmu_tx_create(os);
-               dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (dd));
-               dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-               error = dmu_tx_assign(tx, TXG_WAIT);
-               if (error) {
-                       ztest_record_enospc("create r/w directory");
-                       dmu_tx_abort(tx);
-                       return;
-               }
+       ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
+       ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);
 
-               dd.dd_packobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
-                   DMU_OT_NONE, 0, tx);
-               dd.dd_bigobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
-                   DMU_OT_NONE, 0, tx);
-               ztest_set_random_blocksize(os, dd.dd_packobj, tx);
-               ztest_set_random_blocksize(os, dd.dd_bigobj, tx);
+       if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+               return;
 
-               VERIFY(dmu_object_info(os, dd.dd_bigobj, doi) == 0);
-               ASSERT(doi->doi_data_block_size >= 2 * sizeof (bufwad_t));
-               ASSERT(ISP2(doi->doi_data_block_size));
-               dd.dd_chunk = doi->doi_data_block_size;
+       bigobj = od[0].od_object;
+       packobj = od[1].od_object;
+       blocksize = od[0].od_blocksize;
+       chunksize = blocksize;
+       ASSERT(chunksize == od[1].od_gen);
 
-               dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (dd), &dd,
-                   tx);
-               dmu_tx_commit(tx);
-       } else {
-               VERIFY(dmu_object_info(os, dd.dd_bigobj, doi) == 0);
-               VERIFY(ISP2(doi->doi_data_block_size));
-               VERIFY(dd.dd_chunk == doi->doi_data_block_size);
-               VERIFY(dd.dd_chunk >= 2 * sizeof (bufwad_t));
-       }
+       VERIFY(dmu_object_info(os, bigobj, &doi) == 0);
+       VERIFY(ISP2(doi.doi_data_block_size));
+       VERIFY(chunksize == doi.doi_data_block_size);
+       VERIFY(chunksize >= 2 * sizeof (bufwad_t));
 
        /*
         * Pick a random index and compute the offsets into packobj and bigobj.
@@ -2324,13 +3506,13 @@ ztest_dmu_read_write_zcopy(ztest_args_t *za)
        packoff = n * sizeof (bufwad_t);
        packsize = s * sizeof (bufwad_t);
 
-       bigoff = n * dd.dd_chunk;
-       bigsize = s * dd.dd_chunk;
+       bigoff = n * chunksize;
+       bigsize = s * chunksize;
 
        packbuf = umem_zalloc(packsize, UMEM_NOFAIL);
        bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL);
 
-       VERIFY(dmu_bonus_hold(os, dd.dd_bigobj, FTAG, &bonus_db) == 0);
+       VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db));
 
        bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL);
 
@@ -2356,15 +3538,12 @@ ztest_dmu_read_write_zcopy(ztest_args_t *za)
                for (j = 0; j < s; j++) {
                        if (i != 5) {
                                bigbuf_arcbufs[j] =
-                                   dmu_request_arcbuf(bonus_db,
-                                   dd.dd_chunk);
+                                   dmu_request_arcbuf(bonus_db, chunksize);
                        } else {
                                bigbuf_arcbufs[2 * j] =
-                                   dmu_request_arcbuf(bonus_db,
-                                   dd.dd_chunk / 2);
+                                   dmu_request_arcbuf(bonus_db, chunksize / 2);
                                bigbuf_arcbufs[2 * j + 1] =
-                                   dmu_request_arcbuf(bonus_db,
-                                   dd.dd_chunk / 2);
+                                   dmu_request_arcbuf(bonus_db, chunksize / 2);
                        }
                }
 
@@ -2373,20 +3552,11 @@ ztest_dmu_read_write_zcopy(ztest_args_t *za)
                 */
                tx = dmu_tx_create(os);
 
-               dmu_tx_hold_write(tx, dd.dd_packobj, packoff, packsize);
-               dmu_tx_hold_write(tx, dd.dd_bigobj, bigoff, bigsize);
-
-               if (ztest_random(100) == 0) {
-                       error = -1;
-               } else {
-                       error = dmu_tx_assign(tx, TXG_WAIT);
-               }
+               dmu_tx_hold_write(tx, packobj, packoff, packsize);
+               dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
 
-               if (error) {
-                       if (error != -1) {
-                               ztest_record_enospc("dmu r/w range");
-                       }
-                       dmu_tx_abort(tx);
+               txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+               if (txg == 0) {
                        umem_free(packbuf, packsize);
                        umem_free(bigbuf, bigsize);
                        for (j = 0; j < s; j++) {
@@ -2404,54 +3574,52 @@ ztest_dmu_read_write_zcopy(ztest_args_t *za)
                        return;
                }
 
-               txg = dmu_tx_get_txg(tx);
-
                /*
                 * 50% of the time don't read objects in the 1st iteration to
                 * test dmu_assign_arcbuf() for the case when there're no
                 * existing dbufs for the specified offsets.
                 */
                if (i != 0 || ztest_random(2) != 0) {
-                       error = dmu_read(os, dd.dd_packobj, packoff,
+                       error = dmu_read(os, packobj, packoff,
                            packsize, packbuf, DMU_READ_PREFETCH);
                        ASSERT3U(error, ==, 0);
-                       error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize,
+                       error = dmu_read(os, bigobj, bigoff, bigsize,
                            bigbuf, DMU_READ_PREFETCH);
                        ASSERT3U(error, ==, 0);
                }
                compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize,
-                   n, dd, txg);
+                   n, chunksize, txg);
 
                /*
                 * We've verified all the old bufwads, and made new ones.
                 * Now write them out.
                 */
-               dmu_write(os, dd.dd_packobj, packoff, packsize, packbuf, tx);
-               if (zopt_verbose >= 6) {
+               dmu_write(os, packobj, packoff, packsize, packbuf, tx);
+               if (zopt_verbose >= 7) {
                        (void) printf("writing offset %llx size %llx"
                            " txg %llx\n",
                            (u_longlong_t)bigoff,
                            (u_longlong_t)bigsize,
                            (u_longlong_t)txg);
                }
-               for (off = bigoff, j = 0; j < s; j++, off += dd.dd_chunk) {
+               for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
                        dmu_buf_t *dbt;
                        if (i != 5) {
                                bcopy((caddr_t)bigbuf + (off - bigoff),
-                                   bigbuf_arcbufs[j]->b_data, dd.dd_chunk);
+                                   bigbuf_arcbufs[j]->b_data, chunksize);
                        } else {
                                bcopy((caddr_t)bigbuf + (off - bigoff),
                                    bigbuf_arcbufs[2 * j]->b_data,
-                                   dd.dd_chunk / 2);
+                                   chunksize / 2);
                                bcopy((caddr_t)bigbuf + (off - bigoff) +
-                                   dd.dd_chunk / 2,
+                                   chunksize / 2,
                                    bigbuf_arcbufs[2 * j + 1]->b_data,
-                                   dd.dd_chunk / 2);
+                                   chunksize / 2);
                        }
 
                        if (i == 1) {
-                               VERIFY(dmu_buf_hold(os, dd.dd_bigobj, off,
-                                   FTAG, &dbt) == 0);
+                               VERIFY(dmu_buf_hold(os, bigobj, off,
+                                   FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0);
                        }
                        if (i != 5) {
                                dmu_assign_arcbuf(bonus_db, off,
@@ -2460,7 +3628,7 @@ ztest_dmu_read_write_zcopy(ztest_args_t *za)
                                dmu_assign_arcbuf(bonus_db, off,
                                    bigbuf_arcbufs[2 * j], tx);
                                dmu_assign_arcbuf(bonus_db,
-                                   off + dd.dd_chunk / 2,
+                                   off + chunksize / 2,
                                    bigbuf_arcbufs[2 * j + 1], tx);
                        }
                        if (i == 1) {
@@ -2476,9 +3644,9 @@ ztest_dmu_read_write_zcopy(ztest_args_t *za)
                        void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
                        void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
 
-                       VERIFY(0 == dmu_read(os, dd.dd_packobj, packoff,
+                       VERIFY(0 == dmu_read(os, packobj, packoff,
                            packsize, packcheck, DMU_READ_PREFETCH));
-                       VERIFY(0 == dmu_read(os, dd.dd_bigobj, bigoff,
+                       VERIFY(0 == dmu_read(os, bigobj, bigoff,
                            bigsize, bigcheck, DMU_READ_PREFETCH));
 
                        ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
@@ -2500,256 +3668,60 @@ ztest_dmu_read_write_zcopy(ztest_args_t *za)
        umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
 }
 
+/* ARGSUSED */
 void
-ztest_dmu_check_future_leak(ztest_args_t *za)
-{
-       objset_t *os = za->za_os;
-       dmu_buf_t *db;
-       ztest_block_tag_t *bt;
-       dmu_object_info_t *doi = &za->za_doi;
-
-       /*
-        * Make sure that, if there is a write record in the bonus buffer
-        * of the ZTEST_DIROBJ, that the txg for this record is <= the
-        * last synced txg of the pool.
-        */
-       VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db) == 0);
-       za->za_dbuf = db;
-       VERIFY(dmu_object_info(os, ZTEST_DIROBJ, doi) == 0);
-       ASSERT3U(doi->doi_bonus_size, >=, sizeof (*bt));
-       ASSERT3U(doi->doi_bonus_size, <=, db->db_size);
-       ASSERT3U(doi->doi_bonus_size % sizeof (*bt), ==, 0);
-       bt = (void *)((char *)db->db_data + doi->doi_bonus_size - sizeof (*bt));
-       if (bt->bt_objset != 0) {
-               ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
-               ASSERT3U(bt->bt_object, ==, ZTEST_DIROBJ);
-               ASSERT3U(bt->bt_offset, ==, -1ULL);
-               ASSERT3U(bt->bt_txg, <, spa_first_txg(za->za_spa));
-       }
-       dmu_buf_rele(db, FTAG);
-       za->za_dbuf = NULL;
-}
-
-void
-ztest_dmu_write_parallel(ztest_args_t *za)
+ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id)
 {
-       objset_t *os = za->za_os;
-       ztest_block_tag_t *rbt = &za->za_rbt;
-       ztest_block_tag_t *wbt = &za->za_wbt;
-       const size_t btsize = sizeof (ztest_block_tag_t);
-       dmu_buf_t *db;
-       int b, error;
-       int bs = ZTEST_DIROBJ_BLOCKSIZE;
-       int do_free = 0;
-       uint64_t off, txg, txg_how;
-       mutex_t *lp;
-       char osname[MAXNAMELEN];
-       char iobuf[SPA_MAXBLOCKSIZE];
-       blkptr_t blk = { 0 };
-       uint64_t blkoff;
-       zbookmark_t zb;
-       dmu_tx_t *tx = dmu_tx_create(os);
-       dmu_buf_t *bonus_db;
-       arc_buf_t *abuf = NULL;
-
-       dmu_objset_name(os, osname);
+       ztest_od_t od[1];
+       uint64_t offset = (1ULL << (ztest_random(20) + 43)) +
+           (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
 
        /*
-        * Have multiple threads write to large offsets in ZTEST_DIROBJ
-        * to verify that having multiple threads writing to the same object
-        * in parallel doesn't cause any trouble.
+        * Have multiple threads write to large offsets in an object
+        * to verify that parallel writes to an object -- even to the
+        * same blocks within the object -- doesn't cause any trouble.
         */
-       if (ztest_random(4) == 0) {
-               /*
-                * Do the bonus buffer instead of a regular block.
-                * We need a lock to serialize resize vs. others,
-                * so we hash on the objset ID.
-                */
-               b = dmu_objset_id(os) % ZTEST_SYNC_LOCKS;
-               off = -1ULL;
-               dmu_tx_hold_bonus(tx, ZTEST_DIROBJ);
-       } else {
-               b = ztest_random(ZTEST_SYNC_LOCKS);
-               off = za->za_diroff_shared + (b << SPA_MAXBLOCKSHIFT);
-               if (ztest_random(4) == 0) {
-                       do_free = 1;
-                       dmu_tx_hold_free(tx, ZTEST_DIROBJ, off, bs);
-               } else {
-                       dmu_tx_hold_write(tx, ZTEST_DIROBJ, off, bs);
-               }
-       }
-
-       if (off != -1ULL && P2PHASE(off, bs) == 0 && !do_free &&
-           ztest_random(8) == 0) {
-               VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &bonus_db) == 0);
-               abuf = dmu_request_arcbuf(bonus_db, bs);
-       }
+       ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
 
-       txg_how = ztest_random(2) == 0 ? TXG_WAIT : TXG_NOWAIT;
-       error = dmu_tx_assign(tx, txg_how);
-       if (error) {
-               if (error == ERESTART) {
-                       ASSERT(txg_how == TXG_NOWAIT);
-                       dmu_tx_wait(tx);
-               } else {
-                       ztest_record_enospc("dmu write parallel");
-               }
-               dmu_tx_abort(tx);
-               if (abuf != NULL) {
-                       dmu_return_arcbuf(abuf);
-                       dmu_buf_rele(bonus_db, FTAG);
-               }
+       if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
                return;
-       }
-       txg = dmu_tx_get_txg(tx);
 
-       lp = &ztest_shared->zs_sync_lock[b];
-       (void) mutex_lock(lp);
-
-       wbt->bt_objset = dmu_objset_id(os);
-       wbt->bt_object = ZTEST_DIROBJ;
-       wbt->bt_offset = off;
-       wbt->bt_txg = txg;
-       wbt->bt_thread = za->za_instance;
-       wbt->bt_seq = ztest_shared->zs_seq[b]++;        /* protected by lp */
-
-       /*
-        * Occasionally, write an all-zero block to test the behavior
-        * of blocks that compress into holes.
-        */
-       if (off != -1ULL && ztest_random(8) == 0)
-               bzero(wbt, btsize);
-
-       if (off == -1ULL) {
-               dmu_object_info_t *doi = &za->za_doi;
-               char *dboff;
-
-               VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db) == 0);
-               za->za_dbuf = db;
-               dmu_object_info_from_db(db, doi);
-               ASSERT3U(doi->doi_bonus_size, <=, db->db_size);
-               ASSERT3U(doi->doi_bonus_size, >=, btsize);
-               ASSERT3U(doi->doi_bonus_size % btsize, ==, 0);
-               dboff = (char *)db->db_data + doi->doi_bonus_size - btsize;
-               bcopy(dboff, rbt, btsize);
-               if (rbt->bt_objset != 0) {
-                       ASSERT3U(rbt->bt_objset, ==, wbt->bt_objset);
-                       ASSERT3U(rbt->bt_object, ==, wbt->bt_object);
-                       ASSERT3U(rbt->bt_offset, ==, wbt->bt_offset);
-                       ASSERT3U(rbt->bt_txg, <=, wbt->bt_txg);
-               }
-               if (ztest_random(10) == 0) {
-                       int newsize = (ztest_random(db->db_size /
-                           btsize) + 1) * btsize;
-
-                       ASSERT3U(newsize, >=, btsize);
-                       ASSERT3U(newsize, <=, db->db_size);
-                       VERIFY3U(dmu_set_bonus(db, newsize, tx), ==, 0);
-                       dboff = (char *)db->db_data + newsize - btsize;
-               }
-               dmu_buf_will_dirty(db, tx);
-               bcopy(wbt, dboff, btsize);
-               dmu_buf_rele(db, FTAG);
-               za->za_dbuf = NULL;
-       } else if (do_free) {
-               VERIFY(dmu_free_range(os, ZTEST_DIROBJ, off, bs, tx) == 0);
-       } else if (abuf == NULL) {
-               dmu_write(os, ZTEST_DIROBJ, off, btsize, wbt, tx);
-       } else {
-               bcopy(wbt, abuf->b_data, btsize);
-               dmu_assign_arcbuf(bonus_db, off, abuf, tx);
-               dmu_buf_rele(bonus_db, FTAG);
-       }
-
-       (void) mutex_unlock(lp);
-
-       if (ztest_random(1000) == 0)
-               (void) poll(NULL, 0, 1); /* open dn_notxholds window */
+       while (ztest_random(10) != 0)
+               ztest_io(zd, od[0].od_object, offset);
+}
 
-       dmu_tx_commit(tx);
+void
+ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id)
+{
+       ztest_od_t od[1];
+       uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) +
+           (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
+       uint64_t count = ztest_random(20) + 1;
+       uint64_t blocksize = ztest_random_blocksize();
+       void *data;
 
-       if (ztest_random(10000) == 0)
-               txg_wait_synced(dmu_objset_pool(os), txg);
+       ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
 
-       if (off == -1ULL || do_free)
+       if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
                return;
 
-       if (ztest_random(2) != 0)
+       if (ztest_truncate(zd, od[0].od_object, offset, count * blocksize) != 0)
                return;
 
-       /*
-        * dmu_sync() the block we just wrote.
-        */
-       (void) mutex_lock(lp);
-
-       blkoff = P2ALIGN_TYPED(off, bs, uint64_t);
-       error = dmu_buf_hold(os, ZTEST_DIROBJ, blkoff, FTAG, &db);
-       za->za_dbuf = db;
-       if (error) {
-               (void) mutex_unlock(lp);
-               return;
-       }
-       blkoff = off - blkoff;
-       error = dmu_sync(NULL, db, &blk, txg, NULL, NULL);
-       dmu_buf_rele(db, FTAG);
-       za->za_dbuf = NULL;
+       ztest_prealloc(zd, od[0].od_object, offset, count * blocksize);
 
-       if (error) {
-               (void) mutex_unlock(lp);
-               return;
-       }
+       data = umem_zalloc(blocksize, UMEM_NOFAIL);
 
-       if (blk.blk_birth == 0) {       /* concurrent free */
-               (void) mutex_unlock(lp);
-               return;
+       while (ztest_random(count) != 0) {
+               uint64_t randoff = offset + (ztest_random(count) * blocksize);
+               if (ztest_write(zd, od[0].od_object, randoff, blocksize,
+                   data) != 0)
+                       break;
+               while (ztest_random(4) != 0)
+                       ztest_io(zd, od[0].od_object, randoff);
        }
 
-       txg_suspend(dmu_objset_pool(os));
-
-       (void) mutex_unlock(lp);
-
-       ASSERT(blk.blk_fill == 1);
-       ASSERT3U(BP_GET_TYPE(&blk), ==, DMU_OT_UINT64_OTHER);
-       ASSERT3U(BP_GET_LEVEL(&blk), ==, 0);
-       ASSERT3U(BP_GET_LSIZE(&blk), ==, bs);
-
-       /*
-        * Read the block that dmu_sync() returned to make sure its contents
-        * match what we wrote.  We do this while still txg_suspend()ed
-        * to ensure that the block can't be reused before we read it.
-        */
-       zb.zb_objset = dmu_objset_id(os);
-       zb.zb_object = ZTEST_DIROBJ;
-       zb.zb_level = 0;
-       zb.zb_blkid = off / bs;
-       error = zio_wait(zio_read(NULL, za->za_spa, &blk, iobuf, bs,
-           NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, &zb));
-       ASSERT3U(error, ==, 0);
-
-       txg_resume(dmu_objset_pool(os));
-
-       bcopy(&iobuf[blkoff], rbt, btsize);
-
-       if (rbt->bt_objset == 0)                /* concurrent free */
-               return;
-
-       if (wbt->bt_objset == 0)                /* all-zero overwrite */
-               return;
-
-       ASSERT3U(rbt->bt_objset, ==, wbt->bt_objset);
-       ASSERT3U(rbt->bt_object, ==, wbt->bt_object);
-       ASSERT3U(rbt->bt_offset, ==, wbt->bt_offset);
-
-       /*
-        * The semantic of dmu_sync() is that we always push the most recent
-        * version of the data, so in the face of concurrent updates we may
-        * see a newer version of the block.  That's OK.
-        */
-       ASSERT3U(rbt->bt_txg, >=, wbt->bt_txg);
-       if (rbt->bt_thread == wbt->bt_thread)
-               ASSERT3U(rbt->bt_seq, ==, wbt->bt_seq);
-       else
-               ASSERT3U(rbt->bt_seq, >, wbt->bt_seq);
+       umem_free(data, blocksize);
 }
 
 /*
@@ -2760,9 +3732,10 @@ ztest_dmu_write_parallel(ztest_args_t *za)
 #define        ZTEST_ZAP_MAX_PROPS     1000
 
 void
-ztest_zap(ztest_args_t *za)
+ztest_zap(ztest_ds_t *zd, uint64_t id)
 {
-       objset_t *os = za->za_os;
+       objset_t *os = zd->zd_os;
+       ztest_od_t od[1];
        uint64_t object;
        uint64_t txg, last_txg;
        uint64_t value[ZTEST_ZAP_MAX_INTS];
@@ -2771,64 +3744,45 @@ ztest_zap(ztest_args_t *za)
        dmu_tx_t *tx;
        char propname[100], txgname[100];
        int error;
-       char osname[MAXNAMELEN];
        char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" };
 
-       dmu_objset_name(os, osname);
+       ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);
 
-       /*
-        * Create a new object if necessary, and record it in the directory.
-        */
-       VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
-           sizeof (uint64_t), &object, DMU_READ_PREFETCH));
+       if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
+               return;
 
-       if (object == 0) {
-               tx = dmu_tx_create(os);
-               dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
-                   sizeof (uint64_t));
-               dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL);
-               error = dmu_tx_assign(tx, TXG_WAIT);
-               if (error) {
-                       ztest_record_enospc("create zap test obj");
-                       dmu_tx_abort(tx);
-                       return;
-               }
-               object = zap_create(os, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx);
-               if (error) {
-                       fatal(0, "zap_create('%s', %llu) = %d",
-                           osname, object, error);
-               }
-               ASSERT(object != 0);
-               dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
-                   sizeof (uint64_t), &object, tx);
-               /*
-                * Generate a known hash collision, and verify that
-                * we can lookup and remove both entries.
-                */
-               for (i = 0; i < 2; i++) {
-                       value[i] = i;
-                       error = zap_add(os, object, hc[i], sizeof (uint64_t),
-                           1, &value[i], tx);
-                       ASSERT3U(error, ==, 0);
-               }
-               for (i = 0; i < 2; i++) {
-                       error = zap_add(os, object, hc[i], sizeof (uint64_t),
-                           1, &value[i], tx);
-                       ASSERT3U(error, ==, EEXIST);
-                       error = zap_length(os, object, hc[i],
-                           &zl_intsize, &zl_ints);
-                       ASSERT3U(error, ==, 0);
-                       ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
-                       ASSERT3U(zl_ints, ==, 1);
-               }
-               for (i = 0; i < 2; i++) {
-                       error = zap_remove(os, object, hc[i], tx);
-                       ASSERT3U(error, ==, 0);
-               }
+       object = od[0].od_object;
 
-               dmu_tx_commit(tx);
+       /*
+        * Generate a known hash collision, and verify that
+        * we can lookup and remove both entries.
+        */
+       tx = dmu_tx_create(os);
+       dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+       txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+       if (txg == 0)
+               return;
+       for (i = 0; i < 2; i++) {
+               value[i] = i;
+               VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t),
+                   1, &value[i], tx));
+       }
+       for (i = 0; i < 2; i++) {
+               VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i],
+                   sizeof (uint64_t), 1, &value[i], tx));
+               VERIFY3U(0, ==,
+                   zap_length(os, object, hc[i], &zl_intsize, &zl_ints));
+               ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
+               ASSERT3U(zl_ints, ==, 1);
+       }
+       for (i = 0; i < 2; i++) {
+               VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx));
        }
+       dmu_tx_commit(tx);
 
+       /*
+        * Generate a bunch of random entries.
+        */
        ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS);
 
        prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
@@ -2872,14 +3826,10 @@ ztest_zap(ztest_args_t *za)
         * should be txg + object + n.
         */
        tx = dmu_tx_create(os);
-       dmu_tx_hold_zap(tx, object, TRUE, NULL);
-       error = dmu_tx_assign(tx, TXG_WAIT);
-       if (error) {
-               ztest_record_enospc("create zap entry");
-               dmu_tx_abort(tx);
+       dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+       txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+       if (txg == 0)
                return;
-       }
-       txg = dmu_tx_get_txg(tx);
 
        if (last_txg > txg)
                fatal(0, "zap future leak: old %llu new %llu", last_txg, txg);
@@ -2887,16 +3837,10 @@ ztest_zap(ztest_args_t *za)
        for (i = 0; i < ints; i++)
                value[i] = txg + object + i;
 
-       error = zap_update(os, object, txgname, sizeof (uint64_t), 1, &txg, tx);
-       if (error)
-               fatal(0, "zap_update('%s', %llu, '%s') = %d",
-                   osname, object, txgname, error);
-
-       error = zap_update(os, object, propname, sizeof (uint64_t),
-           ints, value, tx);
-       if (error)
-               fatal(0, "zap_update('%s', %llu, '%s') = %d",
-                   osname, object, propname, error);
+       VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t),
+           1, &txg, tx));
+       VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t),
+           ints, value, tx));
 
        dmu_tx_commit(tx);
 
@@ -2915,60 +3859,78 @@ ztest_zap(ztest_args_t *za)
        ASSERT3U(error, ==, 0);
 
        tx = dmu_tx_create(os);
-       dmu_tx_hold_zap(tx, object, TRUE, NULL);
-       error = dmu_tx_assign(tx, TXG_WAIT);
-       if (error) {
-               ztest_record_enospc("remove zap entry");
-               dmu_tx_abort(tx);
+       dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+       txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+       if (txg == 0)
                return;
-       }
-       error = zap_remove(os, object, txgname, tx);
-       if (error)
-               fatal(0, "zap_remove('%s', %llu, '%s') = %d",
-                   osname, object, txgname, error);
+       VERIFY3U(0, ==, zap_remove(os, object, txgname, tx));
+       VERIFY3U(0, ==, zap_remove(os, object, propname, tx));
+       dmu_tx_commit(tx);
+}
 
-       error = zap_remove(os, object, propname, tx);
-       if (error)
-               fatal(0, "zap_remove('%s', %llu, '%s') = %d",
-                   osname, object, propname, error);
+/*
+ * Testcase to test the upgrading of a microzap to fatzap.
+ */
+void
+ztest_fzap(ztest_ds_t *zd, uint64_t id)
+{
+       objset_t *os = zd->zd_os;
+       ztest_od_t od[1];
+       uint64_t object, txg;
 
-       dmu_tx_commit(tx);
+       ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);
+
+       if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
+               return;
+
+       object = od[0].od_object;
 
        /*
-        * Once in a while, destroy the object.
+        * Add entries to this ZAP and make sure it spills over
+        * and gets upgraded to a fatzap. Also, since we are adding
+        * 2050 entries we should see ptrtbl growth and leaf-block split.
         */
-       if (ztest_random(1000) != 0)
-               return;
+       for (int i = 0; i < 2050; i++) {
+               char name[MAXNAMELEN];
+               uint64_t value = i;
+               dmu_tx_t *tx;
+               int error;
 
-       tx = dmu_tx_create(os);
-       dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t));
-       dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
-       error = dmu_tx_assign(tx, TXG_WAIT);
-       if (error) {
-               ztest_record_enospc("destroy zap object");
-               dmu_tx_abort(tx);
-               return;
+               (void) snprintf(name, sizeof (name), "fzap-%llu-%llu",
+                   id, value);
+
+               tx = dmu_tx_create(os);
+               dmu_tx_hold_zap(tx, object, B_TRUE, name);
+               txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+               if (txg == 0)
+                       return;
+               error = zap_add(os, object, name, sizeof (uint64_t), 1,
+                   &value, tx);
+               ASSERT(error == 0 || error == EEXIST);
+               dmu_tx_commit(tx);
        }
-       error = zap_destroy(os, object, tx);
-       if (error)
-               fatal(0, "zap_destroy('%s', %llu) = %d",
-                   osname, object, error);
-       object = 0;
-       dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t),
-           &object, tx);
-       dmu_tx_commit(tx);
 }
 
+/* ARGSUSED */
 void
-ztest_zap_parallel(ztest_args_t *za)
+ztest_zap_parallel(ztest_ds_t *zd, uint64_t id)
 {
-       objset_t *os = za->za_os;
+       objset_t *os = zd->zd_os;
+       ztest_od_t od[1];
        uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
        dmu_tx_t *tx;
        int i, namelen, error;
+       int micro = ztest_random(2);
        char name[20], string_value[20];
        void *data;
 
+       ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0);
+
+       if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+               return;
+
+       object = od[0].od_object;
+
        /*
         * Generate a random name of the form 'xxx.....' where each
         * x is a random printable character and the dots are dots.
@@ -2983,12 +3945,7 @@ ztest_zap_parallel(ztest_args_t *za)
                name[i] = '.';
        name[i] = '\0';
 
-       if (ztest_random(2) == 0)
-               object = ZTEST_MICROZAP_OBJ;
-       else
-               object = ZTEST_FATZAP_OBJ;
-
-       if ((namelen & 1) || object == ZTEST_MICROZAP_OBJ) {
+       if ((namelen & 1) || micro) {
                wsize = sizeof (txg);
                wc = 1;
                data = &txg;
@@ -3009,14 +3966,10 @@ ztest_zap_parallel(ztest_args_t *za)
 
        if (i >= 2) {
                tx = dmu_tx_create(os);
-               dmu_tx_hold_zap(tx, object, TRUE, NULL);
-               error = dmu_tx_assign(tx, TXG_WAIT);
-               if (error) {
-                       ztest_record_enospc("zap parallel");
-                       dmu_tx_abort(tx);
+               dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+               txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+               if (txg == 0)
                        return;
-               }
-               txg = dmu_tx_get_txg(tx);
                bcopy(name, string_value, namelen);
        } else {
                tx = NULL;
@@ -3067,79 +4020,397 @@ ztest_zap_parallel(ztest_args_t *za)
                dmu_tx_commit(tx);
 }
 
+/*
+ * Commit callback data.
+ */
+typedef struct ztest_cb_data {
+       list_node_t             zcd_node;
+       uint64_t                zcd_txg;
+       int                     zcd_expected_err;
+       boolean_t               zcd_added;
+       boolean_t               zcd_called;
+       spa_t                   *zcd_spa;
+} ztest_cb_data_t;
+
+/* This is the actual commit callback function */
+static void
+ztest_commit_callback(void *arg, int error)
+{
+       ztest_cb_data_t *data = arg;
+       uint64_t synced_txg;
+
+       VERIFY(data != NULL);
+       VERIFY3S(data->zcd_expected_err, ==, error);
+       VERIFY(!data->zcd_called);
+
+       synced_txg = spa_last_synced_txg(data->zcd_spa);
+       if (data->zcd_txg > synced_txg)
+               fatal(0, "commit callback of txg %" PRIu64 " called prematurely"
+                   ", last synced txg = %" PRIu64 "\n", data->zcd_txg,
+                   synced_txg);
+
+       data->zcd_called = B_TRUE;
+
+       if (error == ECANCELED) {
+               ASSERT3U(data->zcd_txg, ==, 0);
+               ASSERT(!data->zcd_added);
+
+               /*
+                * The private callback data should be destroyed here, but
+                * since we are going to check the zcd_called field after
+                * dmu_tx_abort(), we will destroy it there.
+                */
+               return;
+       }
+
+       /* Was this callback added to the global callback list? */
+       if (!data->zcd_added)
+               goto out;
+
+       ASSERT3U(data->zcd_txg, !=, 0);
+
+       /* Remove our callback from the list */
+       (void) mutex_lock(&zcl.zcl_callbacks_lock);
+       list_remove(&zcl.zcl_callbacks, data);
+       (void) mutex_unlock(&zcl.zcl_callbacks_lock);
+
+out:
+       umem_free(data, sizeof (ztest_cb_data_t));
+}
+
+/* Allocate and initialize callback data structure */
+static ztest_cb_data_t *
+ztest_create_cb_data(objset_t *os, uint64_t txg)
+{
+       ztest_cb_data_t *cb_data;
+
+       cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL);
+
+       cb_data->zcd_txg = txg;
+       cb_data->zcd_spa = dmu_objset_spa(os);
+
+       return (cb_data);
+}
+
+/*
+ * If a number of txgs equal to this threshold have been created after a commit
+ * callback has been registered but not called, then we assume there is an
+ * implementation bug.
+ */
+#define        ZTEST_COMMIT_CALLBACK_THRESH    (TXG_CONCURRENT_STATES + 2)
+
+/*
+ * Commit callback test.
+ */
 void
-ztest_dsl_prop_get_set(ztest_args_t *za)
+ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
+{
+       objset_t *os = zd->zd_os;
+       ztest_od_t od[1];
+       dmu_tx_t *tx;
+       ztest_cb_data_t *cb_data[3], *tmp_cb;
+       uint64_t old_txg, txg;
+       int i, error = 0;
+
+       ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
+
+       if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+               return;
+
+       tx = dmu_tx_create(os);
+
+       cb_data[0] = ztest_create_cb_data(os, 0);
+       dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);
+
+       dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t));
+
+       /* Every once in a while, abort the transaction on purpose */
+       if (ztest_random(100) == 0)
+               error = -1;
+
+       if (!error)
+               error = dmu_tx_assign(tx, TXG_NOWAIT);
+
+       txg = error ? 0 : dmu_tx_get_txg(tx);
+
+       cb_data[0]->zcd_txg = txg;
+       cb_data[1] = ztest_create_cb_data(os, txg);
+       dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]);
+
+       if (error) {
+               /*
+                * It's not a strict requirement to call the registered
+                * callbacks from inside dmu_tx_abort(), but that's what
+                * is supposed to happen in the current implementation
+                * so we will check for that.
+                */
+               for (i = 0; i < 2; i++) {
+                       cb_data[i]->zcd_expected_err = ECANCELED;
+                       VERIFY(!cb_data[i]->zcd_called);
+               }
+
+               dmu_tx_abort(tx);
+
+               for (i = 0; i < 2; i++) {
+                       VERIFY(cb_data[i]->zcd_called);
+                       umem_free(cb_data[i], sizeof (ztest_cb_data_t));
+               }
+
+               return;
+       }
+
+       cb_data[2] = ztest_create_cb_data(os, txg);
+       dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);
+
+       /*
+        * Read existing data to make sure there isn't a future leak.
+        */
+       VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t),
+           &old_txg, DMU_READ_PREFETCH));
+
+       if (old_txg > txg)
+               fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64,
+                   old_txg, txg);
+
+       dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx);
+
+       (void) mutex_lock(&zcl.zcl_callbacks_lock);
+
+       /*
+        * Since commit callbacks don't have any ordering requirement and since
+        * it is theoretically possible for a commit callback to be called
+        * after an arbitrary amount of time has elapsed since its txg has been
+        * synced, it is difficult to reliably determine whether a commit
+        * callback hasn't been called due to high load or due to a flawed
+        * implementation.
+        *
+        * In practice, we will assume that if after a certain number of txgs a
+        * commit callback hasn't been called, then most likely there's an
+        * implementation bug.
+        */
+       tmp_cb = list_head(&zcl.zcl_callbacks);
+       if (tmp_cb != NULL &&
+           tmp_cb->zcd_txg > txg - ZTEST_COMMIT_CALLBACK_THRESH) {
+               fatal(0, "Commit callback threshold exceeded, oldest txg: %"
+                   PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg);
+       }
+
+       /*
+        * Let's find the place to insert our callbacks.
+        *
+        * Even though the list is ordered by txg, it is possible for the
+        * insertion point to not be the end because our txg may already be
+        * quiescing at this point and other callbacks in the open txg
+        * (from other objsets) may have sneaked in.
+        */
+       tmp_cb = list_tail(&zcl.zcl_callbacks);
+       while (tmp_cb != NULL && tmp_cb->zcd_txg > txg)
+               tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb);
+
+       /* Add the 3 callbacks to the list */
+       for (i = 0; i < 3; i++) {
+               if (tmp_cb == NULL)
+                       list_insert_head(&zcl.zcl_callbacks, cb_data[i]);
+               else
+                       list_insert_after(&zcl.zcl_callbacks, tmp_cb,
+                           cb_data[i]);
+
+               cb_data[i]->zcd_added = B_TRUE;
+               VERIFY(!cb_data[i]->zcd_called);
+
+               tmp_cb = cb_data[i];
+       }
+
+       (void) mutex_unlock(&zcl.zcl_callbacks_lock);
+
+       dmu_tx_commit(tx);
+}
+
+/* ARGSUSED */
+void
+ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id)
+{
+       zfs_prop_t proplist[] = {
+               ZFS_PROP_CHECKSUM,
+               ZFS_PROP_COMPRESSION,
+               ZFS_PROP_COPIES,
+               ZFS_PROP_DEDUP
+       };
+       ztest_shared_t *zs = ztest_shared;
+
+       (void) rw_rdlock(&zs->zs_name_lock);
+
+       for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++)
+               (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p],
+                   ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2));
+
+       (void) rw_unlock(&zs->zs_name_lock);
+}
+
+/* ARGSUSED */
+void
+ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
+{
+       ztest_shared_t *zs = ztest_shared;
+       nvlist_t *props = NULL;
+
+       (void) rw_rdlock(&zs->zs_name_lock);
+
+       (void) ztest_spa_prop_set_uint64(zs, ZPOOL_PROP_DEDUPDITTO,
+           ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN));
+
+       VERIFY3U(spa_prop_get(zs->zs_spa, &props), ==, 0);
+
+       if (zopt_verbose >= 6)
+               dump_nvlist(props, 4);
+
+       nvlist_free(props);
+
+       (void) rw_unlock(&zs->zs_name_lock);
+}
+
+/*
+ * Test snapshot hold/release and deferred destroy.
+ */
+void
+ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
 {
-       objset_t *os = za->za_os;
-       int i, inherit;
-       uint64_t value;
-       const char *prop, *valname;
-       char setpoint[MAXPATHLEN];
-       char osname[MAXNAMELEN];
        int error;
+       objset_t *os = zd->zd_os;
+       objset_t *origin;
+       char snapname[100];
+       char fullname[100];
+       char clonename[100];
+       char tag[100];
+       char osname[MAXNAMELEN];
 
        (void) rw_rdlock(&ztest_shared->zs_name_lock);
 
        dmu_objset_name(os, osname);
 
-       for (i = 0; i < 2; i++) {
-               if (i == 0) {
-                       prop = "checksum";
-                       value = ztest_random_checksum();
-                       inherit = (value == ZIO_CHECKSUM_INHERIT);
-               } else {
-                       prop = "compression";
-                       value = ztest_random_compress();
-                       inherit = (value == ZIO_COMPRESS_INHERIT);
+       (void) snprintf(snapname, 100, "sh1_%llu", id);
+       (void) snprintf(fullname, 100, "%s@%s", osname, snapname);
+       (void) snprintf(clonename, 100, "%s/ch1_%llu", osname, id);
+       (void) snprintf(tag, 100, "tag_%llu", id);
+
+       /*
+        * Clean up from any previous run.
+        */
+       (void) dmu_objset_destroy(clonename, B_FALSE);
+       (void) dsl_dataset_user_release(osname, snapname, tag, B_FALSE);
+       (void) dmu_objset_destroy(fullname, B_FALSE);
+
+       /*
+        * Create snapshot, clone it, mark snap for deferred destroy,
+        * destroy clone, verify snap was also destroyed.
+        */
+       error = dmu_objset_snapshot(osname, snapname, NULL, FALSE);
+       if (error) {
+               if (error == ENOSPC) {
+                       ztest_record_enospc("dmu_objset_snapshot");
+                       goto out;
                }
+               fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
+       }
 
-               error = dsl_prop_set(osname, prop, sizeof (value),
-                   !inherit, &value);
+       error = dmu_objset_hold(fullname, FTAG, &origin);
+       if (error)
+               fatal(0, "dmu_objset_hold(%s) = %d", fullname, error);
 
+       error = dmu_objset_clone(clonename, dmu_objset_ds(origin), 0);
+       dmu_objset_rele(origin, FTAG);
+       if (error) {
                if (error == ENOSPC) {
-                       ztest_record_enospc("dsl_prop_set");
-                       break;
+                       ztest_record_enospc("dmu_objset_clone");
+                       goto out;
                }
+               fatal(0, "dmu_objset_clone(%s) = %d", clonename, error);
+       }
 
-               ASSERT3U(error, ==, 0);
+       error = dmu_objset_destroy(fullname, B_TRUE);
+       if (error) {
+               fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d",
+                   fullname, error);
+       }
 
-               VERIFY3U(dsl_prop_get(osname, prop, sizeof (value),
-                   1, &value, setpoint), ==, 0);
+       error = dmu_objset_destroy(clonename, B_FALSE);
+       if (error)
+               fatal(0, "dmu_objset_destroy(%s) = %d", clonename, error);
 
-               if (i == 0)
-                       valname = zio_checksum_table[value].ci_name;
-               else
-                       valname = zio_compress_table[value].ci_name;
+       error = dmu_objset_hold(fullname, FTAG, &origin);
+       if (error != ENOENT)
+               fatal(0, "dmu_objset_hold(%s) = %d", fullname, error);
 
-               if (zopt_verbose >= 6) {
-                       (void) printf("%s %s = %s for '%s'\n",
-                           osname, prop, valname, setpoint);
+       /*
+        * Create snapshot, add temporary hold, verify that we can't
+        * destroy a held snapshot, mark for deferred destroy,
+        * release hold, verify snapshot was destroyed.
+        */
+       error = dmu_objset_snapshot(osname, snapname, NULL, FALSE);
+       if (error) {
+               if (error == ENOSPC) {
+                       ztest_record_enospc("dmu_objset_snapshot");
+                       goto out;
                }
+               fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
+       }
+
+       error = dsl_dataset_user_hold(osname, snapname, tag, B_FALSE, B_TRUE);
+       if (error)
+               fatal(0, "dsl_dataset_user_hold(%s, %s) = %d",
+                   fullname, tag, error);
+
+       error = dmu_objset_destroy(fullname, B_FALSE);
+       if (error != EBUSY) {
+               fatal(0, "dmu_objset_destroy(%s, B_FALSE) = %d",
+                   fullname, error);
+       }
+
+       error = dmu_objset_destroy(fullname, B_TRUE);
+       if (error) {
+               fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d",
+                   fullname, error);
        }
 
+       error = dsl_dataset_user_release(osname, snapname, tag, B_FALSE);
+       if (error)
+               fatal(0, "dsl_dataset_user_release(%s, %s) = %d",
+                   fullname, tag, error);
+
+       VERIFY(dmu_objset_hold(fullname, FTAG, &origin) == ENOENT);
+
+out:
        (void) rw_unlock(&ztest_shared->zs_name_lock);
 }
 
 /*
  * Inject random faults into the on-disk data.
  */
+/* ARGSUSED */
 void
-ztest_fault_inject(ztest_args_t *za)
+ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
 {
+       ztest_shared_t *zs = ztest_shared;
+       spa_t *spa = zs->zs_spa;
        int fd;
        uint64_t offset;
-       uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
+       uint64_t leaves;
        uint64_t bad = 0x1990c0ffeedecade;
        uint64_t top, leaf;
        char path0[MAXPATHLEN];
        char pathrand[MAXPATHLEN];
        size_t fsize;
-       spa_t *spa = za->za_spa;
        int bshift = SPA_MAXBLOCKSHIFT + 2;     /* don't scrog all labels */
        int iters = 1000;
-       int maxfaults = zopt_maxfaults;
+       int maxfaults;
+       int mirror_save;
        vdev_t *vd0 = NULL;
        uint64_t guid0 = 0;
+       boolean_t islog = B_FALSE;
+
+       VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+       maxfaults = MAXFAULTS();
+       leaves = MAX(zs->zs_mirrors, 1) * zopt_raidz;
+       mirror_save = zs->zs_mirrors;
+       VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
 
        ASSERT(leaves >= 1);
 
@@ -3150,10 +4421,10 @@ ztest_fault_inject(ztest_args_t *za)
 
        if (ztest_random(2) == 0) {
                /*
-                * Inject errors on a normal data device.
+                * Inject errors on a normal data device or slog device.
                 */
-               top = ztest_random(spa->spa_root_vdev->vdev_children);
-               leaf = ztest_random(leaves);
+               top = ztest_random_vdev_top(spa, B_TRUE);
+               leaf = ztest_random(leaves) + zs->zs_splits;
 
                /*
                 * Generate paths to the first leaf in this top-level vdev,
@@ -3162,11 +4433,14 @@ ztest_fault_inject(ztest_args_t *za)
                 * and we'll write random garbage to the randomly chosen leaf.
                 */
                (void) snprintf(path0, sizeof (path0), ztest_dev_template,
-                   zopt_dir, zopt_pool, top * leaves + 0);
+                   zopt_dir, zopt_pool, top * leaves + zs->zs_splits);
                (void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template,
                    zopt_dir, zopt_pool, top * leaves + leaf);
 
                vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
+               if (vd0 != NULL && vd0->vdev_top->vdev_islog)
+                       islog = B_TRUE;
+
                if (vd0 != NULL && maxfaults != 1) {
                        /*
                         * Make vd0 explicitly claim to be unreadable,
@@ -3212,22 +4486,38 @@ ztest_fault_inject(ztest_args_t *za)
 
        spa_config_exit(spa, SCL_STATE, FTAG);
 
-       if (maxfaults == 0)
-               return;
-
        /*
-        * If we can tolerate two or more faults, randomly online/offline vd0.
+        * If we can tolerate two or more faults, or we're dealing
+        * with a slog, randomly online/offline vd0.
         */
-       if (maxfaults >= 2 && guid0 != 0) {
+       if ((maxfaults >= 2 || islog) && guid0 != 0) {
                if (ztest_random(10) < 6) {
                        int flags = (ztest_random(2) == 0 ?
                            ZFS_OFFLINE_TEMPORARY : 0);
+
+                       /*
+                        * We have to grab the zs_name_lock as writer to
+                        * prevent a race between offlining a slog and
+                        * destroying a dataset. Offlining the slog will
+                        * grab a reference on the dataset which may cause
+                        * dmu_objset_destroy() to fail with EBUSY thus
+                        * leaving the dataset in an inconsistent state.
+                        */
+                       if (islog)
+                               (void) rw_wrlock(&ztest_shared->zs_name_lock);
+
                        VERIFY(vdev_offline(spa, guid0, flags) != EBUSY);
+
+                       if (islog)
+                               (void) rw_unlock(&ztest_shared->zs_name_lock);
                } else {
                        (void) vdev_online(spa, guid0, 0, NULL);
                }
        }
 
+       if (maxfaults == 0)
+               return;
+
        /*
         * We have at least single-fault tolerance, so inject data corruption.
         */
@@ -3246,173 +4536,198 @@ ztest_fault_inject(ztest_args_t *za)
                if (offset >= fsize)
                        continue;
 
-               if (zopt_verbose >= 6)
-                       (void) printf("injecting bad word into %s,"
-                           " offset 0x%llx\n", pathrand, (u_longlong_t)offset);
+               VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+               if (mirror_save != zs->zs_mirrors) {
+                       VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+                       (void) close(fd);
+                       return;
+               }
 
                if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
                        fatal(1, "can't inject bad word at 0x%llx in %s",
                            offset, pathrand);
+
+               VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+
+               if (zopt_verbose >= 7)
+                       (void) printf("injected bad word into %s,"
+                           " offset 0x%llx\n", pathrand, (u_longlong_t)offset);
        }
 
        (void) close(fd);
 }
 
 /*
- * Scrub the pool.
+ * Verify that DDT repair works as expected.
  */
 void
-ztest_scrub(ztest_args_t *za)
+ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
 {
-       spa_t *spa = za->za_spa;
-
-       (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
-       (void) poll(NULL, 0, 1000); /* wait a second, then force a restart */
-       (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
-}
+       ztest_shared_t *zs = ztest_shared;
+       spa_t *spa = zs->zs_spa;
+       objset_t *os = zd->zd_os;
+       ztest_od_t od[1];
+       uint64_t object, blocksize, txg, pattern, psize;
+       enum zio_checksum checksum = spa_dedup_checksum(spa);
+       dmu_buf_t *db;
+       dmu_tx_t *tx;
+       void *buf;
+       blkptr_t blk;
+       int copies = 2 * ZIO_DEDUPDITTO_MIN;
 
-/*
- * Rename the pool to a different name and then rename it back.
- */
-void
-ztest_spa_rename(ztest_args_t *za)
-{
-       char *oldname, *newname;
-       int error;
-       spa_t *spa;
+       blocksize = ztest_random_blocksize();
+       blocksize = MIN(blocksize, 2048);       /* because we write so many */
 
-       (void) rw_wrlock(&ztest_shared->zs_name_lock);
+       ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
 
-       oldname = za->za_pool;
-       newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL);
-       (void) strcpy(newname, oldname);
-       (void) strcat(newname, "_tmp");
+       if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+               return;
 
        /*
-        * Do the rename
+        * Take the name lock as writer to prevent anyone else from changing
+        * the pool and dataset properties we need to maintain during this test.
         */
-       error = spa_rename(oldname, newname);
-       if (error)
-               fatal(0, "spa_rename('%s', '%s') = %d", oldname,
-                   newname, error);
+       (void) rw_wrlock(&zs->zs_name_lock);
 
-       /*
-        * Try to open it under the old name, which shouldn't exist
-        */
-       error = spa_open(oldname, &spa, FTAG);
-       if (error != ENOENT)
-               fatal(0, "spa_open('%s') = %d", oldname, error);
+       if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum,
+           B_FALSE) != 0 ||
+           ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1,
+           B_FALSE) != 0) {
+               (void) rw_unlock(&zs->zs_name_lock);
+               return;
+       }
+
+       object = od[0].od_object;
+       blocksize = od[0].od_blocksize;
+       pattern = spa_guid(spa) ^ dmu_objset_fsid_guid(os);
+
+       ASSERT(object != 0);
+
+       tx = dmu_tx_create(os);
+       dmu_tx_hold_write(tx, object, 0, copies * blocksize);
+       txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+       if (txg == 0) {
+               (void) rw_unlock(&zs->zs_name_lock);
+               return;
+       }
 
        /*
-        * Open it under the new name and make sure it's still the same spa_t.
+        * Write all the copies of our block.
         */
-       error = spa_open(newname, &spa, FTAG);
-       if (error != 0)
-               fatal(0, "spa_open('%s') = %d", newname, error);
+       for (int i = 0; i < copies; i++) {
+               uint64_t offset = i * blocksize;
+               VERIFY(dmu_buf_hold(os, object, offset, FTAG, &db,
+                   DMU_READ_NO_PREFETCH) == 0);
+               ASSERT(db->db_offset == offset);
+               ASSERT(db->db_size == blocksize);
+               ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) ||
+                   ztest_pattern_match(db->db_data, db->db_size, 0ULL));
+               dmu_buf_will_fill(db, tx);
+               ztest_pattern_set(db->db_data, db->db_size, pattern);
+               dmu_buf_rele(db, FTAG);
+       }
 
-       ASSERT(spa == za->za_spa);
-       spa_close(spa, FTAG);
+       dmu_tx_commit(tx);
+       txg_wait_synced(spa_get_dsl(spa), txg);
 
        /*
-        * Rename it back to the original
+        * Find out what block we got.
         */
-       error = spa_rename(newname, oldname);
-       if (error)
-               fatal(0, "spa_rename('%s', '%s') = %d", newname,
-                   oldname, error);
+       VERIFY(dmu_buf_hold(os, object, 0, FTAG, &db,
+           DMU_READ_NO_PREFETCH) == 0);
+       blk = *((dmu_buf_impl_t *)db)->db_blkptr;
+       dmu_buf_rele(db, FTAG);
 
        /*
-        * Make sure it can still be opened
+        * Damage the block.  Dedup-ditto will save us when we read it later.
         */
-       error = spa_open(oldname, &spa, FTAG);
-       if (error != 0)
-               fatal(0, "spa_open('%s') = %d", oldname, error);
+       psize = BP_GET_PSIZE(&blk);
+       buf = zio_buf_alloc(psize);
+       ztest_pattern_set(buf, psize, ~pattern);
 
-       ASSERT(spa == za->za_spa);
-       spa_close(spa, FTAG);
+       (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
+           buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
+           ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));
 
-       umem_free(newname, strlen(newname) + 1);
+       zio_buf_free(buf, psize);
 
-       (void) rw_unlock(&ztest_shared->zs_name_lock);
+       (void) rw_unlock(&zs->zs_name_lock);
 }
 
-
 /*
- * Completely obliterate one disk.
+ * Scrub the pool.
  */
-static void
-ztest_obliterate_one_disk(uint64_t vdev)
+/* ARGSUSED */
+void
+ztest_scrub(ztest_ds_t *zd, uint64_t id)
 {
-       int fd;
-       char dev_name[MAXPATHLEN], copy_name[MAXPATHLEN];
-       size_t fsize;
+       ztest_shared_t *zs = ztest_shared;
+       spa_t *spa = zs->zs_spa;
 
-       if (zopt_maxfaults < 2)
-               return;
+       (void) spa_scan(spa, POOL_SCAN_SCRUB);
+       (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */
+       (void) spa_scan(spa, POOL_SCAN_SCRUB);
+}
 
-       (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev);
-       (void) snprintf(copy_name, MAXPATHLEN, "%s.old", dev_name);
+/*
+ * Rename the pool to a different name and then rename it back.
+ */
+/* ARGSUSED */
+void
+ztest_spa_rename(ztest_ds_t *zd, uint64_t id)
+{
+       ztest_shared_t *zs = ztest_shared;
+       char *oldname, *newname;
+       spa_t *spa;
 
-       fd = open(dev_name, O_RDWR);
+       (void) rw_wrlock(&zs->zs_name_lock);
 
-       if (fd == -1)
-               fatal(1, "can't open %s", dev_name);
+       oldname = zs->zs_pool;
+       newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL);
+       (void) strcpy(newname, oldname);
+       (void) strcat(newname, "_tmp");
 
        /*
-        * Determine the size.
+        * Do the rename
         */
-       fsize = lseek(fd, 0, SEEK_END);
-
-       (void) close(fd);
+       VERIFY3U(0, ==, spa_rename(oldname, newname));
 
        /*
-        * Rename the old device to dev_name.old (useful for debugging).
+        * Try to open it under the old name, which shouldn't exist
         */
-       VERIFY(rename(dev_name, copy_name) == 0);
+       VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
 
        /*
-        * Create a new one.
-        */
-       VERIFY((fd = open(dev_name, O_RDWR | O_CREAT | O_TRUNC, 0666)) >= 0);
-       VERIFY(ftruncate(fd, fsize) == 0);
-       (void) close(fd);
-}
-
-static void
-ztest_replace_one_disk(spa_t *spa, uint64_t vdev)
-{
-       char dev_name[MAXPATHLEN];
-       nvlist_t *root;
-       int error;
-       uint64_t guid;
-       vdev_t *vd;
+        * Open it under the new name and make sure it's still the same spa_t.
+        */
+       VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
 
-       (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev);
+       ASSERT(spa == zs->zs_spa);
+       spa_close(spa, FTAG);
 
        /*
-        * Build the nvlist describing dev_name.
+        * Rename it back to the original
         */
-       root = make_vdev_root(dev_name, NULL, 0, 0, 0, 0, 0, 1);
+       VERIFY3U(0, ==, spa_rename(newname, oldname));
 
-       spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
-       if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, dev_name)) == NULL)
-               guid = 0;
-       else
-               guid = vd->vdev_guid;
-       spa_config_exit(spa, SCL_VDEV, FTAG);
-       error = spa_vdev_attach(spa, guid, root, B_TRUE);
-       if (error != 0 &&
-           error != EBUSY &&
-           error != ENOTSUP &&
-           error != ENODEV &&
-           error != EDOM)
-               fatal(0, "spa_vdev_attach(in-place) = %d", error);
+       /*
+        * Make sure it can still be opened
+        */
+       VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
 
-       nvlist_free(root);
+       ASSERT(spa == zs->zs_spa);
+       spa_close(spa, FTAG);
+
+       umem_free(newname, strlen(newname) + 1);
+
+       (void) rw_unlock(&zs->zs_name_lock);
 }
 
+/*
+ * Verify pool integrity by running zdb.
+ */
 static void
-ztest_verify_blocks(char *pool)
+ztest_run_zdb(char *pool)
 {
        int status;
        char zdb[MAXPATHLEN + MAXNAMELEN + 20];
@@ -3433,11 +4748,12 @@ ztest_verify_blocks(char *pool)
        isa = strdup(isa);
        /* LINTED */
        (void) sprintf(bin,
-           "/usr/sbin%.*s/zdb -bcc%s%s -U /tmp/zpool.cache %s",
+           "/usr/sbin%.*s/zdb -bcc%s%s -U %s %s",
            isalen,
            isa,
            zopt_verbose >= 3 ? "s" : "",
            zopt_verbose >= 4 ? "v" : "",
+           spa_config_path,
            pool);
        free(isa);
 
@@ -3483,7 +4799,6 @@ ztest_spa_import_export(char *oldname, char *newname)
        nvlist_t *config, *newconfig;
        uint64_t pool_guid;
        spa_t *spa;
-       int error;
 
        if (zopt_verbose >= 4) {
                (void) printf("import/export: old = %s, new = %s\n",
@@ -3498,15 +4813,13 @@ ztest_spa_import_export(char *oldname, char *newname)
        /*
         * Get the pool's configuration and guid.
         */
-       error = spa_open(oldname, &spa, FTAG);
-       if (error)
-               fatal(0, "spa_open('%s') = %d", oldname, error);
+       VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
 
        /*
         * Kick off a scrub to tickle scrub/export races.
         */
        if (ztest_random(2) == 0)
-               (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
+               (void) spa_scan(spa, POOL_SCAN_SCRUB);
 
        pool_guid = spa_guid(spa);
        spa_close(spa, FTAG);
@@ -3516,9 +4829,7 @@ ztest_spa_import_export(char *oldname, char *newname)
        /*
         * Export it.
         */
-       error = spa_export(oldname, &config, B_FALSE, B_FALSE);
-       if (error)
-               fatal(0, "spa_export('%s') = %d", oldname, error);
+       VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE));
 
        ztest_walk_pool_directory("pools after export");
 
@@ -3532,39 +4843,29 @@ ztest_spa_import_export(char *oldname, char *newname)
        /*
         * Import it under the new name.
         */
-       error = spa_import(newname, config, NULL);
-       if (error)
-               fatal(0, "spa_import('%s') = %d", newname, error);
+       VERIFY3U(0, ==, spa_import(newname, config, NULL));
 
        ztest_walk_pool_directory("pools after import");
 
        /*
         * Try to import it again -- should fail with EEXIST.
         */
-       error = spa_import(newname, config, NULL);
-       if (error != EEXIST)
-               fatal(0, "spa_import('%s') twice", newname);
+       VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL));
 
        /*
         * Try to import it under a different name -- should fail with EEXIST.
         */
-       error = spa_import(oldname, config, NULL);
-       if (error != EEXIST)
-               fatal(0, "spa_import('%s') under multiple names", newname);
+       VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL));
 
        /*
         * Verify that the pool is no longer visible under the old name.
         */
-       error = spa_open(oldname, &spa, FTAG);
-       if (error != ENOENT)
-               fatal(0, "spa_open('%s') = %d", newname, error);
+       VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
 
        /*
         * Verify that we can open and close the pool using the new name.
         */
-       error = spa_open(newname, &spa, FTAG);
-       if (error)
-               fatal(0, "spa_open('%s') = %d", newname, error);
+       VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
        ASSERT(pool_guid == spa_guid(spa));
        spa_close(spa, FTAG);
 
@@ -3574,12 +4875,12 @@ ztest_spa_import_export(char *oldname, char *newname)
 static void
 ztest_resume(spa_t *spa)
 {
-       if (spa_suspended(spa)) {
-               spa_vdev_state_enter(spa);
-               vdev_clear(spa, NULL);
-               (void) spa_vdev_state_exit(spa, NULL, 0);
-               (void) zio_resume(spa);
-       }
+       if (spa_suspended(spa) && zopt_verbose >= 6)
+               (void) printf("resuming from suspended state\n");
+       spa_vdev_state_enter(spa, SCL_NONE);
+       vdev_clear(spa, NULL);
+       (void) spa_vdev_state_exit(spa, NULL, 0);
+       (void) zio_resume(spa);
 }
 
 static void *
@@ -3588,155 +4889,252 @@ ztest_resume_thread(void *arg)
        spa_t *spa = arg;
 
        while (!ztest_exiting) {
-               (void) poll(NULL, 0, 1000);
-               ztest_resume(spa);
+               if (spa_suspended(spa))
+                       ztest_resume(spa);
+               (void) poll(NULL, 0, 100);
        }
        return (NULL);
 }
 
 static void *
+ztest_deadman_thread(void *arg)
+{
+       ztest_shared_t *zs = arg;
+       int grace = 300;
+       hrtime_t delta;
+
+       delta = (zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + grace;
+
+       (void) poll(NULL, 0, (int)(1000 * delta));
+
+       fatal(0, "failed to complete within %d seconds of deadline", grace);
+
+       return (NULL);
+}
+
+static void
+ztest_execute(ztest_info_t *zi, uint64_t id)
+{
+       ztest_shared_t *zs = ztest_shared;
+       ztest_ds_t *zd = &zs->zs_zd[id % zopt_datasets];
+       hrtime_t functime = gethrtime();
+
+       for (int i = 0; i < zi->zi_iters; i++)
+               zi->zi_func(zd, id);
+
+       functime = gethrtime() - functime;
+
+       atomic_add_64(&zi->zi_call_count, 1);
+       atomic_add_64(&zi->zi_call_time, functime);
+
+       if (zopt_verbose >= 4) {
+               Dl_info dli;
+               (void) dladdr((void *)zi->zi_func, &dli);
+               (void) printf("%6.2f sec in %s\n",
+                   (double)functime / NANOSEC, dli.dli_sname);
+       }
+}
+
+static void *
 ztest_thread(void *arg)
 {
-       ztest_args_t *za = arg;
+       uint64_t id = (uintptr_t)arg;
        ztest_shared_t *zs = ztest_shared;
-       hrtime_t now, functime;
+       uint64_t call_next;
+       hrtime_t now;
        ztest_info_t *zi;
-       int f, i;
 
-       while ((now = gethrtime()) < za->za_stop) {
+       while ((now = gethrtime()) < zs->zs_thread_stop) {
                /*
                 * See if it's time to force a crash.
                 */
-               if (now > za->za_kill) {
-                       zs->zs_alloc = spa_get_alloc(za->za_spa);
-                       zs->zs_space = spa_get_space(za->za_spa);
-                       (void) kill(getpid(), SIGKILL);
-               }
+               if (now > zs->zs_thread_kill)
+                       ztest_kill(zs);
 
                /*
-                * Pick a random function.
+                * If we're getting ENOSPC with some regularity, stop.
                 */
-               f = ztest_random(ZTEST_FUNCS);
-               zi = &zs->zs_info[f];
+               if (zs->zs_enospc_count > 10)
+                       break;
 
                /*
-                * Decide whether to call it, based on the requested frequency.
+                * Pick a random function to execute.
                 */
-               if (zi->zi_call_target == 0 ||
-                   (double)zi->zi_call_total / zi->zi_call_target >
-                   (double)(now - zs->zs_start_time) / (zopt_time * NANOSEC))
-                       continue;
+               zi = &zs->zs_info[ztest_random(ZTEST_FUNCS)];
+               call_next = zi->zi_call_next;
 
-               atomic_add_64(&zi->zi_calls, 1);
-               atomic_add_64(&zi->zi_call_total, 1);
+               if (now >= call_next &&
+                   atomic_cas_64(&zi->zi_call_next, call_next, call_next +
+                   ztest_random(2 * zi->zi_interval[0] + 1)) == call_next)
+                       ztest_execute(zi, id);
+       }
 
-               za->za_diroff = (za->za_instance * ZTEST_FUNCS + f) *
-                   ZTEST_DIRSIZE;
-               za->za_diroff_shared = (1ULL << 63);
+       return (NULL);
+}
 
-               for (i = 0; i < zi->zi_iters; i++)
-                       zi->zi_func(za);
+static void
+ztest_dataset_name(char *dsname, char *pool, int d)
+{
+       (void) snprintf(dsname, MAXNAMELEN, "%s/ds_%d", pool, d);
+}
 
-               functime = gethrtime() - now;
+static void
+ztest_dataset_destroy(ztest_shared_t *zs, int d)
+{
+       char name[MAXNAMELEN];
 
-               atomic_add_64(&zi->zi_call_time, functime);
+       ztest_dataset_name(name, zs->zs_pool, d);
 
-               if (zopt_verbose >= 4) {
-                       Dl_info dli;
-                       (void) dladdr((void *)zi->zi_func, &dli);
-                       (void) printf("%6.2f sec in %s\n",
-                           (double)functime / NANOSEC, dli.dli_sname);
-               }
+       if (zopt_verbose >= 3)
+               (void) printf("Destroying %s to free up space\n", name);
 
-               /*
-                * If we're getting ENOSPC with some regularity, stop.
-                */
-               if (zs->zs_enospc_count > 10)
-                       break;
+       /*
+        * Cleanup any non-standard clones and snapshots.  In general,
+        * ztest thread t operates on dataset (t % zopt_datasets),
+        * so there may be more than one thing to clean up.
+        */
+       for (int t = d; t < zopt_threads; t += zopt_datasets)
+               ztest_dsl_dataset_cleanup(name, t);
+
+       (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
+           DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+}
+
+static void
+ztest_dataset_dirobj_verify(ztest_ds_t *zd)
+{
+       uint64_t usedobjs, dirobjs, scratch;
+
+       /*
+        * ZTEST_DIROBJ is the object directory for the entire dataset.
+        * Therefore, the number of objects in use should equal the
+        * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself.
+        * If not, we have an object leak.
+        *
+        * Note that we can only check this in ztest_dataset_open(),
+        * when the open-context and syncing-context values agree.
+        * That's because zap_count() returns the open-context value,
+        * while dmu_objset_space() returns the rootbp fill count.
+        */
+       VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs));
+       dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch);
+       ASSERT3U(dirobjs + 1, ==, usedobjs);
+}
+
+static int
+ztest_dataset_open(ztest_shared_t *zs, int d)
+{
+       ztest_ds_t *zd = &zs->zs_zd[d];
+       uint64_t committed_seq = zd->zd_seq;
+       objset_t *os;
+       zilog_t *zilog;
+       char name[MAXNAMELEN];
+       int error;
+
+       ztest_dataset_name(name, zs->zs_pool, d);
+
+       (void) rw_rdlock(&zs->zs_name_lock);
+
+       error = ztest_dataset_create(name);
+       if (error == ENOSPC) {
+               (void) rw_unlock(&zs->zs_name_lock);
+               ztest_record_enospc(FTAG);
+               return (error);
        }
+       ASSERT(error == 0 || error == EEXIST);
 
-       return (NULL);
+       VERIFY3U(dmu_objset_hold(name, zd, &os), ==, 0);
+       (void) rw_unlock(&zs->zs_name_lock);
+
+       ztest_zd_init(zd, os);
+
+       zilog = zd->zd_zilog;
+
+       if (zilog->zl_header->zh_claim_lr_seq != 0 &&
+           zilog->zl_header->zh_claim_lr_seq < committed_seq)
+               fatal(0, "missing log records: claimed %llu < committed %llu",
+                   zilog->zl_header->zh_claim_lr_seq, committed_seq);
+
+       ztest_dataset_dirobj_verify(zd);
+
+       zil_replay(os, zd, ztest_replay_vector);
+
+       ztest_dataset_dirobj_verify(zd);
+
+       if (zopt_verbose >= 6)
+               (void) printf("%s replay %llu blocks, %llu records, seq %llu\n",
+                   zd->zd_name,
+                   (u_longlong_t)zilog->zl_parse_blk_count,
+                   (u_longlong_t)zilog->zl_parse_lr_count,
+                   (u_longlong_t)zilog->zl_replaying_seq);
+
+       zilog = zil_open(os, ztest_get_data);
+
+       if (zilog->zl_replaying_seq != 0 &&
+           zilog->zl_replaying_seq < committed_seq)
+               fatal(0, "missing log records: replayed %llu < committed %llu",
+                   zilog->zl_replaying_seq, committed_seq);
+
+       return (0);
+}
+
+static void
+ztest_dataset_close(ztest_shared_t *zs, int d)
+{
+       ztest_ds_t *zd = &zs->zs_zd[d];
+
+       zil_close(zd->zd_zilog);
+       dmu_objset_rele(zd->zd_os, zd);
+
+       ztest_zd_fini(zd);
 }
 
 /*
  * Kick off threads to run tests on all datasets in parallel.
  */
 static void
-ztest_run(char *pool)
+ztest_run(ztest_shared_t *zs)
 {
-       int t, d, error;
-       ztest_shared_t *zs = ztest_shared;
-       ztest_args_t *za;
+       thread_t *tid;
        spa_t *spa;
-       char name[100];
        thread_t resume_tid;
+       int error;
 
        ztest_exiting = B_FALSE;
 
-       (void) _mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL);
-       (void) rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL);
-
-       for (t = 0; t < ZTEST_SYNC_LOCKS; t++)
-               (void) _mutex_init(&zs->zs_sync_lock[t], USYNC_THREAD, NULL);
-
-       /*
-        * Destroy one disk before we even start.
-        * It's mirrored, so everything should work just fine.
-        * This makes us exercise fault handling very early in spa_load().
-        */
-       ztest_obliterate_one_disk(0);
-
        /*
-        * Verify that the sum of the sizes of all blocks in the pool
-        * equals the SPA's allocated space total.
+        * Initialize parent/child shared state.
         */
-       ztest_verify_blocks(pool);
+       VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0);
+       VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0);
 
-       /*
-        * Kick off a replacement of the disk we just obliterated.
-        */
-       kernel_init(FREAD | FWRITE);
-       VERIFY(spa_open(pool, &spa, FTAG) == 0);
-       ztest_replace_one_disk(spa, 0);
-       if (zopt_verbose >= 5)
-               show_pool_stats(spa);
-       spa_close(spa, FTAG);
-       kernel_fini();
-
-       kernel_init(FREAD | FWRITE);
+       zs->zs_thread_start = gethrtime();
+       zs->zs_thread_stop = zs->zs_thread_start + zopt_passtime * NANOSEC;
+       zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop);
+       zs->zs_thread_kill = zs->zs_thread_stop;
+       if (ztest_random(100) < zopt_killrate)
+               zs->zs_thread_kill -= ztest_random(zopt_passtime * NANOSEC);
 
-       /*
-        * Verify that we can export the pool and reimport it under a
-        * different name.
-        */
-       if (ztest_random(2) == 0) {
-               (void) snprintf(name, 100, "%s_import", pool);
-               ztest_spa_import_export(pool, name);
-               ztest_spa_import_export(name, pool);
-       }
+       (void) _mutex_init(&zcl.zcl_callbacks_lock, USYNC_THREAD, NULL);
 
-       /*
-        * Verify that we can loop over all pools.
-        */
-       mutex_enter(&spa_namespace_lock);
-       for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) {
-               if (zopt_verbose > 3) {
-                       (void) printf("spa_next: found %s\n", spa_name(spa));
-               }
-       }
-       mutex_exit(&spa_namespace_lock);
+       list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t),
+           offsetof(ztest_cb_data_t, zcd_node));
 
        /*
         * Open our pool.
         */
-       VERIFY(spa_open(pool, &spa, FTAG) == 0);
+       kernel_init(FREAD | FWRITE);
+       VERIFY(spa_open(zs->zs_pool, &spa, FTAG) == 0);
+       zs->zs_spa = spa;
+
+       spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;
 
        /*
         * We don't expect the pool to suspend unless maxfaults == 0,
         * in which case ztest_fault_inject() temporarily takes away
         * the only valid replica.
         */
-       if (zopt_maxfaults == 0)
+       if (MAXFAULTS() == 0)
                spa->spa_failmode = ZIO_FAILURE_MODE_WAIT;
        else
                spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;
@@ -3748,13 +5146,19 @@ ztest_run(char *pool)
            &resume_tid) == 0);
 
        /*
+        * Create a deadman thread to abort() if we hang.
+        */
+       VERIFY(thr_create(0, 0, ztest_deadman_thread, zs, THR_BOUND,
+           NULL) == 0);
+
+       /*
         * Verify that we can safely inquire about about any object,
         * whether it's allocated or not.  To make it interesting,
         * we probe a 5-wide window around each power of two.
         * This hits all edge cases, including zero and the max.
         */
-       for (t = 0; t < 64; t++) {
-               for (d = -5; d <= 5; d++) {
+       for (int t = 0; t < 64; t++) {
+               for (int d = -5; d <= 5; d++) {
                        error = dmu_object_info(spa->spa_meta_objset,
                            (1ULL << t) + d, NULL);
                        ASSERT(error == 0 || error == ENOENT ||
@@ -3763,121 +5167,157 @@ ztest_run(char *pool)
        }
 
        /*
-        * Now kick off all the tests that run in parallel.
+        * If we got any ENOSPC errors on the previous run, destroy something.
         */
+       if (zs->zs_enospc_count != 0) {
+               int d = ztest_random(zopt_datasets);
+               ztest_dataset_destroy(zs, d);
+       }
        zs->zs_enospc_count = 0;
 
-       za = umem_zalloc(zopt_threads * sizeof (ztest_args_t), UMEM_NOFAIL);
+       tid = umem_zalloc(zopt_threads * sizeof (thread_t), UMEM_NOFAIL);
 
        if (zopt_verbose >= 4)
                (void) printf("starting main threads...\n");
 
-       za[0].za_start = gethrtime();
-       za[0].za_stop = za[0].za_start + zopt_passtime * NANOSEC;
-       za[0].za_stop = MIN(za[0].za_stop, zs->zs_stop_time);
-       za[0].za_kill = za[0].za_stop;
-       if (ztest_random(100) < zopt_killrate)
-               za[0].za_kill -= ztest_random(zopt_passtime * NANOSEC);
-
-       for (t = 0; t < zopt_threads; t++) {
-               d = t % zopt_datasets;
-
-               (void) strcpy(za[t].za_pool, pool);
-               za[t].za_os = za[d].za_os;
-               za[t].za_spa = spa;
-               za[t].za_zilog = za[d].za_zilog;
-               za[t].za_instance = t;
-               za[t].za_random = ztest_random(-1ULL);
-               za[t].za_start = za[0].za_start;
-               za[t].za_stop = za[0].za_stop;
-               za[t].za_kill = za[0].za_kill;
-
-               if (t < zopt_datasets) {
-                       int test_future = FALSE;
-                       (void) rw_rdlock(&ztest_shared->zs_name_lock);
-                       (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
-                       error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0,
-                           ztest_create_cb, NULL);
-                       if (error == EEXIST) {
-                               test_future = TRUE;
-                       } else if (error == ENOSPC) {
-                               zs->zs_enospc_count++;
-                               (void) rw_unlock(&ztest_shared->zs_name_lock);
-                               break;
-                       } else if (error != 0) {
-                               fatal(0, "dmu_objset_create(%s) = %d",
-                                   name, error);
-                       }
-                       error = dmu_objset_open(name, DMU_OST_OTHER,
-                           DS_MODE_USER, &za[d].za_os);
-                       if (error)
-                               fatal(0, "dmu_objset_open('%s') = %d",
-                                   name, error);
-                       (void) rw_unlock(&ztest_shared->zs_name_lock);
-                       if (test_future)
-                               ztest_dmu_check_future_leak(&za[t]);
-                       zil_replay(za[d].za_os, za[d].za_os,
-                           ztest_replay_vector);
-                       za[d].za_zilog = zil_open(za[d].za_os, NULL);
-               }
-
-               VERIFY(thr_create(0, 0, ztest_thread, &za[t], THR_BOUND,
-                   &za[t].za_thread) == 0);
+       /*
+        * Kick off all the tests that run in parallel.
+        */
+       for (int t = 0; t < zopt_threads; t++) {
+               if (t < zopt_datasets && ztest_dataset_open(zs, t) != 0)
+                       return;
+               VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t,
+                   THR_BOUND, &tid[t]) == 0);
        }
 
-       while (--t >= 0) {
-               VERIFY(thr_join(za[t].za_thread, NULL, NULL) == 0);
-               if (t < zopt_datasets) {
-                       zil_close(za[t].za_zilog);
-                       dmu_objset_close(za[t].za_os);
-               }
+       /*
+        * Wait for all of the tests to complete.  We go in reverse order
+        * so we don't close datasets while threads are still using them.
+        */
+       for (int t = zopt_threads - 1; t >= 0; t--) {
+               VERIFY(thr_join(tid[t], NULL, NULL) == 0);
+               if (t < zopt_datasets)
+                       ztest_dataset_close(zs, t);
        }
 
-       if (zopt_verbose >= 3)
-               show_pool_stats(spa);
-
        txg_wait_synced(spa_get_dsl(spa), 0);
 
-       zs->zs_alloc = spa_get_alloc(spa);
-       zs->zs_space = spa_get_space(spa);
+       zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+       zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
+
+       umem_free(tid, zopt_threads * sizeof (thread_t));
+
+       /* Kill the resume thread */
+       ztest_exiting = B_TRUE;
+       VERIFY(thr_join(resume_tid, NULL, NULL) == 0);
+       ztest_resume(spa);
 
        /*
-        * If we had out-of-space errors, destroy a random objset.
+        * Right before closing the pool, kick off a bunch of async I/O;
+        * spa_close() should wait for it to complete.
         */
-       if (zs->zs_enospc_count != 0) {
-               (void) rw_rdlock(&ztest_shared->zs_name_lock);
-               d = (int)ztest_random(zopt_datasets);
-               (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
-               if (zopt_verbose >= 3)
-                       (void) printf("Destroying %s to free up space\n", name);
+       for (uint64_t object = 1; object < 50; object++)
+               dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20);
+
+       spa_close(spa, FTAG);
+
+       /*
+        * Verify that we can loop over all pools.
+        */
+       mutex_enter(&spa_namespace_lock);
+       for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
+               if (zopt_verbose > 3)
+                       (void) printf("spa_next: found %s\n", spa_name(spa));
+       mutex_exit(&spa_namespace_lock);
+
+       /*
+        * Verify that we can export the pool and reimport it under a
+        * different name.
+        */
+       if (ztest_random(2) == 0) {
+               char name[MAXNAMELEN];
+               (void) snprintf(name, MAXNAMELEN, "%s_import", zs->zs_pool);
+               ztest_spa_import_export(zs->zs_pool, name);
+               ztest_spa_import_export(name, zs->zs_pool);
+       }
+
+       kernel_fini();
+}
+
+static void
+ztest_freeze(ztest_shared_t *zs)
+{
+       ztest_ds_t *zd = &zs->zs_zd[0];
+       spa_t *spa;
+       int numloops = 0;
+
+       if (zopt_verbose >= 3)
+               (void) printf("testing spa_freeze()...\n");
 
-               /* Cleanup any non-standard clones and snapshots */
-               ztest_dsl_dataset_cleanup(name, za[d].za_instance);
+       kernel_init(FREAD | FWRITE);
+       VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
+       VERIFY3U(0, ==, ztest_dataset_open(zs, 0));
 
-               (void) dmu_objset_find(name, ztest_destroy_cb, &za[d],
-                   DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
-               (void) rw_unlock(&ztest_shared->zs_name_lock);
+       /*
+        * Force the first log block to be transactionally allocated.
+        * We have to do this before we freeze the pool -- otherwise
+        * the log chain won't be anchored.
+        */
+       while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) {
+               ztest_dmu_object_alloc_free(zd, 0);
+               zil_commit(zd->zd_zilog, UINT64_MAX, 0);
        }
 
        txg_wait_synced(spa_get_dsl(spa), 0);
 
-       umem_free(za, zopt_threads * sizeof (ztest_args_t));
+       /*
+        * Freeze the pool.  This stops spa_sync() from doing anything,
+        * so that the only way to record changes from now on is the ZIL.
+        */
+       spa_freeze(spa);
 
-       /* Kill the resume thread */
-       ztest_exiting = B_TRUE;
-       VERIFY(thr_join(resume_tid, NULL, NULL) == 0);
-       ztest_resume(spa);
+       /*
+        * Run tests that generate log records but don't alter the pool config
+        * or depend on DSL sync tasks (snapshots, objset create/destroy, etc).
+        * We do a txg_wait_synced() after each iteration to force the txg
+        * to increase well beyond the last synced value in the uberblock.
+        * The ZIL should be OK with that.
+        */
+       while (ztest_random(10) != 0 && numloops++ < zopt_maxloops) {
+               ztest_dmu_write_parallel(zd, 0);
+               ztest_dmu_object_alloc_free(zd, 0);
+               txg_wait_synced(spa_get_dsl(spa), 0);
+       }
 
        /*
-        * Right before closing the pool, kick off a bunch of async I/O;
-        * spa_close() should wait for it to complete.
+        * Commit all of the changes we just generated.
         */
-       for (t = 1; t < 50; t++)
-               dmu_prefetch(spa->spa_meta_objset, t, 0, 1 << 15);
+       zil_commit(zd->zd_zilog, UINT64_MAX, 0);
+       txg_wait_synced(spa_get_dsl(spa), 0);
 
+       /*
+        * Close our dataset and close the pool.
+        */
+       ztest_dataset_close(zs, 0);
        spa_close(spa, FTAG);
+       kernel_fini();
 
+       /*
+        * Open and close the pool and dataset to induce log replay.
+        */
+       kernel_init(FREAD | FWRITE);
+       VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
+       VERIFY3U(0, ==, ztest_dataset_open(zs, 0));
+       ztest_dataset_close(zs, 0);
+       spa_close(spa, FTAG);
        kernel_fini();
+
+       list_destroy(&zcl.zcl_callbacks);
+
+       (void) _mutex_destroy(&zcl.zcl_callbacks_lock);
+
+       (void) rwlock_destroy(&zs->zs_name_lock);
+       (void) _mutex_destroy(&zs->zs_vdev_lock);
 }
 
 void
@@ -3905,43 +5345,62 @@ print_time(hrtime_t t, char *timebuf)
                (void) sprintf(timebuf, "%llus", s);
 }
 
+static nvlist_t *
+make_random_props()
+{
+       nvlist_t *props;
+
+       if (ztest_random(2) == 0)
+               return (NULL);
+
+       VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
+       VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0);
+
+       (void) printf("props:\n");
+       dump_nvlist(props, 4);
+
+       return (props);
+}
+
 /*
  * Create a storage pool with the given name and initial vdev size.
- * Then create the specified number of datasets in the pool.
+ * Then test spa_freeze() functionality.
  */
 static void
-ztest_init(char *pool)
+ztest_init(ztest_shared_t *zs)
 {
        spa_t *spa;
-       int error;
-       nvlist_t *nvroot;
+       nvlist_t *nvroot, *props;
+
+       VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0);
+       VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0);
 
        kernel_init(FREAD | FWRITE);
 
        /*
         * Create the storage pool.
         */
-       (void) spa_destroy(pool);
-       ztest_shared->zs_vdev_primaries = 0;
+       (void) spa_destroy(zs->zs_pool);
+       ztest_shared->zs_vdev_next_leaf = 0;
+       zs->zs_splits = 0;
+       zs->zs_mirrors = zopt_mirrors;
        nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
-           0, zopt_raidz, zopt_mirrors, 1);
-       error = spa_create(pool, nvroot, NULL, NULL, NULL);
+           0, zopt_raidz, zs->zs_mirrors, 1);
+       props = make_random_props();
+       VERIFY3U(0, ==, spa_create(zs->zs_pool, nvroot, props, NULL, NULL));
        nvlist_free(nvroot);
 
-       if (error)
-               fatal(0, "spa_create() = %d", error);
-       error = spa_open(pool, &spa, FTAG);
-       if (error)
-               fatal(0, "spa_open() = %d", error);
-
+       VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
        metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
-
-       if (zopt_verbose >= 3)
-               show_pool_stats(spa);
-
        spa_close(spa, FTAG);
 
        kernel_fini();
+
+       ztest_run_zdb(zs->zs_pool);
+
+       ztest_freeze(zs);
+
+       ztest_run_zdb(zs->zs_pool);
 }
 
 int
@@ -3949,29 +5408,32 @@ main(int argc, char **argv)
 {
        int kills = 0;
        int iters = 0;
-       int i, f;
        ztest_shared_t *zs;
+       size_t shared_size;
        ztest_info_t *zi;
        char timebuf[100];
        char numbuf[6];
+       spa_t *spa;
 
        (void) setvbuf(stdout, NULL, _IOLBF, 0);
 
-       /* Override location of zpool.cache */
-       spa_config_path = "/tmp/zpool.cache";
-
        ztest_random_fd = open("/dev/urandom", O_RDONLY);
 
        process_options(argc, argv);
 
+       /* Override location of zpool.cache */
+       (void) asprintf((char **)&spa_config_path, "%s/zpool.cache", zopt_dir);
+
        /*
         * Blow away any existing copy of zpool.cache
         */
        if (zopt_init != 0)
-               (void) remove("/tmp/zpool.cache");
+               (void) remove(spa_config_path);
+
+       shared_size = sizeof (*zs) + zopt_datasets * sizeof (ztest_ds_t);
 
        zs = ztest_shared = (void *)mmap(0,
-           P2ROUNDUP(sizeof (ztest_shared_t), getpagesize()),
+           P2ROUNDUP(shared_size, getpagesize()),
            PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
 
        if (zopt_verbose >= 1) {
@@ -3984,46 +5446,43 @@ main(int argc, char **argv)
        /*
         * Create and initialize our storage pool.
         */
-       for (i = 1; i <= zopt_init; i++) {
+       for (int i = 1; i <= zopt_init; i++) {
                bzero(zs, sizeof (ztest_shared_t));
                if (zopt_verbose >= 3 && zopt_init != 1)
                        (void) printf("ztest_init(), pass %d\n", i);
-               ztest_init(zopt_pool);
+               zs->zs_pool = zopt_pool;
+               ztest_init(zs);
        }
 
-       /*
-        * Initialize the call targets for each function.
-        */
-       for (f = 0; f < ZTEST_FUNCS; f++) {
-               zi = &zs->zs_info[f];
+       zs->zs_pool = zopt_pool;
+       zs->zs_proc_start = gethrtime();
+       zs->zs_proc_stop = zs->zs_proc_start + zopt_time * NANOSEC;
 
+       for (int f = 0; f < ZTEST_FUNCS; f++) {
+               zi = &zs->zs_info[f];
                *zi = ztest_info[f];
-
-               if (*zi->zi_interval == 0)
-                       zi->zi_call_target = UINT64_MAX;
+               if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
+                       zi->zi_call_next = UINT64_MAX;
                else
-                       zi->zi_call_target = zopt_time / *zi->zi_interval;
+                       zi->zi_call_next = zs->zs_proc_start +
+                           ztest_random(2 * zi->zi_interval[0] + 1);
        }
 
-       zs->zs_start_time = gethrtime();
-       zs->zs_stop_time = zs->zs_start_time + zopt_time * NANOSEC;
-
        /*
         * Run the tests in a loop.  These tests include fault injection
         * to verify that self-healing data works, and forced crashes
         * to verify that we never lose on-disk consistency.
         */
-       while (gethrtime() < zs->zs_stop_time) {
+       while (gethrtime() < zs->zs_proc_stop) {
                int status;
                pid_t pid;
-               char *tmp;
 
                /*
                 * Initialize the workload counters for each function.
                 */
-               for (f = 0; f < ZTEST_FUNCS; f++) {
+               for (int f = 0; f < ZTEST_FUNCS; f++) {
                        zi = &zs->zs_info[f];
-                       zi->zi_calls = 0;
+                       zi->zi_call_count = 0;
                        zi->zi_call_time = 0;
                }
 
@@ -4039,7 +5498,7 @@ main(int argc, char **argv)
                        struct rlimit rl = { 1024, 1024 };
                        (void) setrlimit(RLIMIT_NOFILE, &rl);
                        (void) enable_extended_FILE_stdio(-1, -1);
-                       ztest_run(zopt_pool);
+                       ztest_run(zs);
                        exit(0);
                }
 
@@ -4072,8 +5531,8 @@ main(int argc, char **argv)
                if (zopt_verbose >= 1) {
                        hrtime_t now = gethrtime();
 
-                       now = MIN(now, zs->zs_stop_time);
-                       print_time(zs->zs_stop_time - now, timebuf);
+                       now = MIN(now, zs->zs_proc_stop);
+                       print_time(zs->zs_proc_stop - now, timebuf);
                        nicenum(zs->zs_space, numbuf);
 
                        (void) printf("Pass %3d, %8s, %3llu ENOSPC, "
@@ -4083,7 +5542,7 @@ main(int argc, char **argv)
                            (u_longlong_t)zs->zs_enospc_count,
                            100.0 * zs->zs_alloc / zs->zs_space,
                            numbuf,
-                           100.0 * (now - zs->zs_start_time) /
+                           100.0 * (now - zs->zs_proc_start) /
                            (zopt_time * NANOSEC), timebuf);
                }
 
@@ -4093,34 +5552,39 @@ main(int argc, char **argv)
                            "Calls", "Time", "Function");
                        (void) printf("%7s %9s   %s\n",
                            "-----", "----", "--------");
-                       for (f = 0; f < ZTEST_FUNCS; f++) {
+                       for (int f = 0; f < ZTEST_FUNCS; f++) {
                                Dl_info dli;
 
                                zi = &zs->zs_info[f];
                                print_time(zi->zi_call_time, timebuf);
                                (void) dladdr((void *)zi->zi_func, &dli);
                                (void) printf("%7llu %9s   %s\n",
-                                   (u_longlong_t)zi->zi_calls, timebuf,
+                                   (u_longlong_t)zi->zi_call_count, timebuf,
                                    dli.dli_sname);
                        }
                        (void) printf("\n");
                }
 
                /*
-                * It's possible that we killed a child during a rename test, in
-                * which case we'll have a 'ztest_tmp' pool lying around instead
-                * of 'ztest'.  Do a blind rename in case this happened.
+                * It's possible that we killed a child during a rename test,
+                * in which case we'll have a 'ztest_tmp' pool lying around
+                * instead of 'ztest'.  Do a blind rename in case this happened.
                 */
-               tmp = umem_alloc(strlen(zopt_pool) + 5, UMEM_NOFAIL);
-               (void) strcpy(tmp, zopt_pool);
-               (void) strcat(tmp, "_tmp");
-               kernel_init(FREAD | FWRITE);
-               (void) spa_rename(tmp, zopt_pool);
+               kernel_init(FREAD);
+               if (spa_open(zopt_pool, &spa, FTAG) == 0) {
+                       spa_close(spa, FTAG);
+               } else {
+                       char tmpname[MAXNAMELEN];
+                       kernel_fini();
+                       kernel_init(FREAD | FWRITE);
+                       (void) snprintf(tmpname, sizeof (tmpname), "%s_tmp",
+                           zopt_pool);
+                       (void) spa_rename(tmpname, zopt_pool);
+               }
                kernel_fini();
-               umem_free(tmp, strlen(tmp) + 1);
-       }
 
-       ztest_verify_blocks(zopt_pool);
+               ztest_run_zdb(zopt_pool);
+       }
 
        if (zopt_verbose >= 1) {
                (void) printf("%d killed, %d completed, %.0f%% kill rate\n",
index e655e0d..15c1c78 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef        _LIBNVPAIR_H
 #define        _LIBNVPAIR_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/nvpair.h>
 #include <stdlib.h>
 #include <stdio.h>
@@ -40,6 +38,7 @@ extern "C" {
 void nvlist_print(FILE *, nvlist_t *);
 int nvpair_value_match(nvpair_t *, int, char *, char **);
 int nvpair_value_match_regex(nvpair_t *, int, char *, regex_t *, char **);
+void dump_nvlist(nvlist_t *, int);
 
 #ifdef __cplusplus
 }
index 0845cb0..57915cd 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <unistd.h>
 #include <strings.h>
+#include <libintl.h>
 #include <sys/types.h>
 #include <sys/inttypes.h>
 #include "libnvpair.h"
@@ -272,6 +271,156 @@ nvlist_print(FILE *fp, nvlist_t *nvl)
        nvlist_print_with_indent(fp, nvl, 0);
 }
 
+
+#define        NVP(elem, type, vtype, ptype, format) { \
+       vtype   value; \
+\
+       (void) nvpair_value_##type(elem, &value); \
+       (void) printf("%*s%s: " format "\n", indent, "", \
+           nvpair_name(elem), (ptype)value); \
+}
+
+#define        NVPA(elem, type, vtype, ptype, format) { \
+       uint_t  i, count; \
+       vtype   *value;  \
+\
+       (void) nvpair_value_##type(elem, &value, &count); \
+       for (i = 0; i < count; i++) { \
+               (void) printf("%*s%s[%d]: " format "\n", indent, "", \
+                   nvpair_name(elem), i, (ptype)value[i]); \
+       } \
+}
+
+/*
+ * Similar to nvlist_print() but handles arrays slightly differently.
+ */
+void
+dump_nvlist(nvlist_t *list, int indent)
+{
+       nvpair_t        *elem = NULL;
+       boolean_t       bool_value;
+       nvlist_t        *nvlist_value;
+       nvlist_t        **nvlist_array_value;
+       uint_t          i, count;
+
+       if (list == NULL) {
+               return;
+       }
+
+       while ((elem = nvlist_next_nvpair(list, elem)) != NULL) {
+               switch (nvpair_type(elem)) {
+               case DATA_TYPE_BOOLEAN_VALUE:
+                       (void) nvpair_value_boolean_value(elem, &bool_value);
+                       (void) printf("%*s%s: %s\n", indent, "",
+                           nvpair_name(elem), bool_value ? "true" : "false");
+                       break;
+
+               case DATA_TYPE_BYTE:
+                       NVP(elem, byte, uchar_t, int, "%u");
+                       break;
+
+               case DATA_TYPE_INT8:
+                       NVP(elem, int8, int8_t, int, "%d");
+                       break;
+
+               case DATA_TYPE_UINT8:
+                       NVP(elem, uint8, uint8_t, int, "%u");
+                       break;
+
+               case DATA_TYPE_INT16:
+                       NVP(elem, int16, int16_t, int, "%d");
+                       break;
+
+               case DATA_TYPE_UINT16:
+                       NVP(elem, uint16, uint16_t, int, "%u");
+                       break;
+
+               case DATA_TYPE_INT32:
+                       NVP(elem, int32, int32_t, long, "%ld");
+                       break;
+
+               case DATA_TYPE_UINT32:
+                       NVP(elem, uint32, uint32_t, ulong_t, "%lu");
+                       break;
+
+               case DATA_TYPE_INT64:
+                       NVP(elem, int64, int64_t, longlong_t, "%lld");
+                       break;
+
+               case DATA_TYPE_UINT64:
+                       NVP(elem, uint64, uint64_t, u_longlong_t, "%llu");
+                       break;
+
+               case DATA_TYPE_STRING:
+                       NVP(elem, string, char *, char *, "'%s'");
+                       break;
+
+               case DATA_TYPE_BYTE_ARRAY:
+                       NVPA(elem, byte_array, uchar_t, int, "%u");
+                       break;
+
+               case DATA_TYPE_INT8_ARRAY:
+                       NVPA(elem, int8_array, int8_t, int, "%d");
+                       break;
+
+               case DATA_TYPE_UINT8_ARRAY:
+                       NVPA(elem, uint8_array, uint8_t, int, "%u");
+                       break;
+
+               case DATA_TYPE_INT16_ARRAY:
+                       NVPA(elem, int16_array, int16_t, int, "%d");
+                       break;
+
+               case DATA_TYPE_UINT16_ARRAY:
+                       NVPA(elem, uint16_array, uint16_t, int, "%u");
+                       break;
+
+               case DATA_TYPE_INT32_ARRAY:
+                       NVPA(elem, int32_array, int32_t, long, "%ld");
+                       break;
+
+               case DATA_TYPE_UINT32_ARRAY:
+                       NVPA(elem, uint32_array, uint32_t, ulong_t, "%lu");
+                       break;
+
+               case DATA_TYPE_INT64_ARRAY:
+                       NVPA(elem, int64_array, int64_t, longlong_t, "%lld");
+                       break;
+
+               case DATA_TYPE_UINT64_ARRAY:
+                       NVPA(elem, uint64_array, uint64_t, u_longlong_t,
+                           "%llu");
+                       break;
+
+               case DATA_TYPE_STRING_ARRAY:
+                       NVPA(elem, string_array, char *, char *, "'%s'");
+                       break;
+
+               case DATA_TYPE_NVLIST:
+                       (void) nvpair_value_nvlist(elem, &nvlist_value);
+                       (void) printf("%*s%s:\n", indent, "",
+                           nvpair_name(elem));
+                       dump_nvlist(nvlist_value, indent + 4);
+                       break;
+
+               case DATA_TYPE_NVLIST_ARRAY:
+                       (void) nvpair_value_nvlist_array(elem,
+                           &nvlist_array_value, &count);
+                       for (i = 0; i < count; i++) {
+                               (void) printf("%*s%s[%u]:\n", indent, "",
+                                   nvpair_name(elem), i);
+                               dump_nvlist(nvlist_array_value[i], indent + 4);
+                       }
+                       break;
+
+               default:
+                       (void) printf(dgettext(TEXT_DOMAIN, "bad config type "
+                           "%d for %s\n"), nvpair_type(elem),
+                           nvpair_name(elem));
+               }
+       }
+}
+
 /*
  * Determine if string 'value' matches 'nvp' value.  The 'value' string is
  * converted, depending on the type of 'nvp', prior to match.  For numeric
index f19e398..6f7fed6 100644 (file)
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _LIBZFS_H
@@ -66,7 +65,6 @@ enum {
        EZFS_BADSTREAM,         /* bad backup stream */
        EZFS_DSREADONLY,        /* dataset is readonly */
        EZFS_VOLTOOBIG,         /* volume is too large for 32-bit system */
-       EZFS_VOLHASDATA,        /* volume already contains data */
        EZFS_INVALIDNAME,       /* invalid dataset name */
        EZFS_BADRESTORE,        /* unable to restore to destination */
        EZFS_BADBACKUP,         /* backup failed */
@@ -85,17 +83,15 @@ enum {
        EZFS_UMOUNTFAILED,      /* failed to unmount dataset */
        EZFS_UNSHARENFSFAILED,  /* unshare(1M) failed */
        EZFS_SHARENFSFAILED,    /* share(1M) failed */
-       EZFS_DEVLINKS,          /* failed to create zvol links */
        EZFS_PERM,              /* permission denied */
        EZFS_NOSPC,             /* out of space */
+       EZFS_FAULT,             /* bad address */
        EZFS_IO,                /* I/O error */
        EZFS_INTR,              /* signal received */
        EZFS_ISSPARE,           /* device is a hot spare */
        EZFS_INVALCONFIG,       /* invalid vdev configuration */
        EZFS_RECURSIVE,         /* recursive dependency */
        EZFS_NOHISTORY,         /* no history object */
-       EZFS_UNSHAREISCSIFAILED, /* iscsitgtd failed request to unshare */
-       EZFS_SHAREISCSIFAILED,  /* iscsitgtd failed request to share */
        EZFS_POOLPROPS,         /* couldn't retrieve pool props */
        EZFS_POOL_NOTSUP,       /* ops not supported for this type of pool */
        EZFS_POOL_INVALARG,     /* invalid argument for this pool operation */
@@ -103,7 +99,6 @@ enum {
        EZFS_OPENFAILED,        /* open of device failed */
        EZFS_NOCAP,             /* couldn't get capacity */
        EZFS_LABELFAILED,       /* write of label failed */
-       EZFS_ISCSISVCUNAVAIL,   /* iscsi service unavailable */
        EZFS_BADWHO,            /* invalid permission who */
        EZFS_BADPERM,           /* invalid permission */
        EZFS_BADPERMSET,        /* invalid permission set name */
@@ -119,6 +114,12 @@ enum {
        EZFS_UNPLAYED_LOGS,     /* log device has unplayed logs */
        EZFS_REFTAG_RELE,       /* snapshot release: tag not found */
        EZFS_REFTAG_HOLD,       /* snapshot hold: tag already exists */
+       EZFS_TAGTOOLONG,        /* snapshot hold/rele: tag too long */
+       EZFS_PIPEFAILED,        /* pipe create failed */
+       EZFS_THREADCREATEFAILED, /* thread create failed */
+       EZFS_POSTSPLIT_ONLINE,  /* onlining a disk after splitting it */
+       EZFS_SCRUBBING,         /* currently scrubbing */
+       EZFS_NO_SCRUB,          /* no active scrub */
        EZFS_UNKNOWN
 };
 
@@ -213,11 +214,19 @@ extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *,
 extern int zpool_destroy(zpool_handle_t *);
 extern int zpool_add(zpool_handle_t *, nvlist_t *);
 
+typedef struct splitflags {
+       /* do not split, but return the config that would be split off */
+       int dryrun : 1;
+
+       /* after splitting, import the pool */
+       int import : 1;
+} splitflags_t;
+
 /*
  * Functions to manipulate pool and vdev state
  */
-extern int zpool_scrub(zpool_handle_t *, pool_scrub_type_t);
-extern int zpool_clear(zpool_handle_t *, const char *);
+extern int zpool_scan(zpool_handle_t *, pool_scan_func_t);
+extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *);
 
 extern int zpool_vdev_online(zpool_handle_t *, const char *, int,
     vdev_state_t *);
@@ -226,9 +235,11 @@ extern int zpool_vdev_attach(zpool_handle_t *, const char *,
     const char *, nvlist_t *, int);
 extern int zpool_vdev_detach(zpool_handle_t *, const char *);
 extern int zpool_vdev_remove(zpool_handle_t *, const char *);
+extern int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *,
+    splitflags_t);
 
-extern int zpool_vdev_fault(zpool_handle_t *, uint64_t);
-extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t);
+extern int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t);
+extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t);
 extern int zpool_vdev_clear(zpool_handle_t *, uint64_t);
 
 extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *,
@@ -298,6 +309,7 @@ typedef enum {
 
 extern zpool_status_t zpool_get_status(zpool_handle_t *, char **);
 extern zpool_status_t zpool_import_status(nvlist_t *, char **);
+extern void zpool_dump_ddt(const ddt_stat_t *dds, const ddt_histogram_t *ddh);
 
 /*
  * Statistics and configuration functions.
@@ -319,23 +331,38 @@ extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *,
 /*
  * Search for pools to import
  */
+
+typedef struct importargs {
+       char **path;            /* a list of paths to search            */
+       int paths;              /* number of paths to search            */
+       char *poolname;         /* name of a pool to find               */
+       uint64_t guid;          /* guid of a pool to find               */
+       char *cachefile;        /* cachefile to use for import          */
+       int can_be_active : 1;  /* can the pool be active?              */
+       int unique : 1;         /* does 'poolname' already exist?       */
+       int exists : 1;         /* set on return if pool already exists */
+} importargs_t;
+
+extern nvlist_t *zpool_search_import(libzfs_handle_t *, importargs_t *);
+
+/* legacy pool search routines */
 extern nvlist_t *zpool_find_import(libzfs_handle_t *, int, char **);
 extern nvlist_t *zpool_find_import_cached(libzfs_handle_t *, const char *,
     char *, uint64_t);
-extern nvlist_t *zpool_find_import_byname(libzfs_handle_t *, int, char **,
-    char *);
-extern nvlist_t *zpool_find_import_byguid(libzfs_handle_t *, int, char **,
-    uint64_t);
-extern nvlist_t *zpool_find_import_activeok(libzfs_handle_t *, int, char **);
 
 /*
  * Miscellaneous pool functions
  */
 struct zfs_cmd;
 
-extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *);
+extern const char *zfs_history_event_names[LOG_END];
+
+extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *,
+    boolean_t verbose);
 extern int zpool_upgrade(zpool_handle_t *, uint64_t);
 extern int zpool_get_history(zpool_handle_t *, nvlist_t **);
+extern int zpool_history_unpack(char *, uint64_t, uint64_t *,
+    nvlist_t ***, uint_t *);
 extern void zpool_set_history_str(const char *subcommand, int argc,
     char **argv, char *history_str);
 extern int zpool_stage_history(libzfs_handle_t *, const char *);
@@ -343,6 +370,8 @@ extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *,
     size_t len);
 extern int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *);
 extern int zpool_get_physpath(zpool_handle_t *, char *, size_t);
+extern void zpool_explain_recover(libzfs_handle_t *, const char *, int,
+    nvlist_t *);
 
 /*
  * Basic handle manipulations.  These functions do not create or destroy the
@@ -374,6 +403,8 @@ extern const char *zfs_prop_to_name(zfs_prop_t);
 extern int zfs_prop_set(zfs_handle_t *, const char *, const char *);
 extern int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t,
     zprop_source_t *, char *, size_t, boolean_t);
+extern int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t,
+    boolean_t);
 extern int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *,
     zprop_source_t *, char *, size_t);
 extern int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname,
@@ -381,10 +412,11 @@ extern int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname,
 extern int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname,
     char *propbuf, int proplen, boolean_t literal);
 extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t);
-extern int zfs_prop_inherit(zfs_handle_t *, const char *);
+extern int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t);
 extern const char *zfs_prop_values(zfs_prop_t);
 extern int zfs_prop_is_string(zfs_prop_t prop);
 extern nvlist_t *zfs_get_user_props(zfs_handle_t *);
+extern nvlist_t *zfs_get_recvd_props(zfs_handle_t *);
 
 typedef struct zprop_list {
        int             pl_prop;
@@ -392,10 +424,11 @@ typedef struct zprop_list {
        struct zprop_list *pl_next;
        boolean_t       pl_all;
        size_t          pl_width;
+       size_t          pl_recvd_width;
        boolean_t       pl_fixed;
 } zprop_list_t;
 
-extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **);
+extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t);
 extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *);
 
 #define        ZFS_MOUNTPOINT_NONE     "none"
@@ -419,13 +452,24 @@ extern int zprop_get_list(libzfs_handle_t *, char *, zprop_list_t **,
     zfs_type_t);
 extern void zprop_free_list(zprop_list_t *);
 
+#define        ZFS_GET_NCOLS   5
+
+typedef enum {
+       GET_COL_NONE,
+       GET_COL_NAME,
+       GET_COL_PROPERTY,
+       GET_COL_VALUE,
+       GET_COL_RECVD,
+       GET_COL_SOURCE
+} zfs_get_column_t;
+
 /*
  * Functions for printing zfs or zpool properties
  */
 typedef struct zprop_get_cbdata {
        int cb_sources;
-       int cb_columns[4];
-       int cb_colwidths[5];
+       zfs_get_column_t cb_columns[ZFS_GET_NCOLS];
+       int cb_colwidths[ZFS_GET_NCOLS + 1];
        boolean_t cb_scripted;
        boolean_t cb_literal;
        boolean_t cb_first;
@@ -434,12 +478,8 @@ typedef struct zprop_get_cbdata {
 } zprop_get_cbdata_t;
 
 void zprop_print_one_property(const char *, zprop_get_cbdata_t *,
-    const char *, const char *, zprop_source_t, const char *);
-
-#define        GET_COL_NAME            1
-#define        GET_COL_PROPERTY        2
-#define        GET_COL_VALUE           3
-#define        GET_COL_SOURCE          4
+    const char *, const char *, zprop_source_t, const char *,
+    const char *);
 
 /*
  * Iterator functions.
@@ -450,6 +490,7 @@ extern int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *);
 extern int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *);
 extern int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *);
 extern int zfs_iter_snapshots(zfs_handle_t *, zfs_iter_f, void *);
+extern int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *);
 
 /*
  * Functions to create and destroy datasets.
@@ -463,11 +504,42 @@ extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *);
 extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *);
 extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t);
 extern int zfs_rename(zfs_handle_t *, const char *, boolean_t);
-extern int zfs_send(zfs_handle_t *, const char *, const char *,
-    boolean_t, boolean_t, boolean_t, boolean_t, int);
+
+typedef struct sendflags {
+       /* print informational messages (ie, -v was specified) */
+       int verbose : 1;
+
+       /* recursive send  (ie, -R) */
+       int replicate : 1;
+
+       /* for incrementals, do all intermediate snapshots */
+       int doall : 1; /* (ie, -I) */
+
+       /* if dataset is a clone, do incremental from its origin */
+       int fromorigin : 1;
+
+       /* do deduplication */
+       int dedup : 1;
+
+       /* send properties (ie, -p) */
+       int props : 1;
+} sendflags_t;
+
+typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *);
+
+extern int zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
+    sendflags_t flags, int outfd, snapfilter_cb_t filter_func,
+    void *cb_arg, nvlist_t **debugnvp);
+
 extern int zfs_promote(zfs_handle_t *);
-extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t);
+extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t,
+    boolean_t, boolean_t);
+extern int zfs_hold_range(zfs_handle_t *, const char *, const char *,
+    const char *, boolean_t, boolean_t, snapfilter_cb_t, void *);
 extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t);
+extern int zfs_release_range(zfs_handle_t *, const char *, const char *,
+    const char *, boolean_t);
+extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *);
 
 typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain,
     uid_t rid, uint64_t space);
@@ -482,6 +554,12 @@ typedef struct recvflags {
        /* the destination is a prefix, not the exact fs (ie, -d) */
        int isprefix : 1;
 
+       /*
+        * Only the tail of the sent snapshot path is appended to the
+        * destination to determine the received snapshot name (ie, -e).
+        */
+       int istail : 1;
+
        /* do not actually do the recv, just check if it would work (ie, -n) */
        int dryrun : 1;
 
@@ -542,10 +620,6 @@ extern int zfs_unshareall_nfs(zfs_handle_t *);
 extern int zfs_unshareall_smb(zfs_handle_t *);
 extern int zfs_unshareall_bypath(zfs_handle_t *, const char *);
 extern int zfs_unshareall(zfs_handle_t *);
-extern boolean_t zfs_is_shared_iscsi(zfs_handle_t *);
-extern int zfs_share_iscsi(zfs_handle_t *);
-extern int zfs_unshare_iscsi(zfs_handle_t *);
-extern int zfs_iscsi_perm_check(libzfs_handle_t *, char *, ucred_t *);
 extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *,
     void *, void *, int, zfs_share_op_t);
 
@@ -571,15 +645,10 @@ extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **,
     boolean_t *);
 
 /*
- * ftyp special.  Read the label from a given device.
+ * Label manipulation.
  */
 extern int zpool_read_label(int, nvlist_t **);
-
-/*
- * Create and remove zvol /dev links.
- */
-extern int zpool_create_zvol_links(zpool_handle_t *);
-extern int zpool_remove_zvol_links(zpool_handle_t *);
+extern int zpool_clear_label(int);
 
 /* is this zvol valid for use as a dump device? */
 extern int zvol_check_dump_config(char *);
@@ -600,6 +669,17 @@ int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *);
 extern int zpool_enable_datasets(zpool_handle_t *, const char *, int);
 extern int zpool_disable_datasets(zpool_handle_t *, boolean_t);
 
+/*
+ * Mappings between vdev and FRU.
+ */
+extern void libzfs_fru_refresh(libzfs_handle_t *);
+extern const char *libzfs_fru_lookup(libzfs_handle_t *, const char *);
+extern const char *libzfs_fru_devpath(libzfs_handle_t *, const char *);
+extern boolean_t libzfs_fru_compare(libzfs_handle_t *, const char *,
+    const char *);
+extern boolean_t libzfs_fru_notself(libzfs_handle_t *, const char *);
+extern int zpool_fru_set(zpool_handle_t *, uint64_t, const char *);
+
 #ifdef __cplusplus
 }
 #endif
index 70a1d1c..89c48c1 100644 (file)
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -30,7 +30,6 @@
 #include <sys/dmu.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_ioctl.h>
-#include <sys/zfs_acl.h>
 #include <sys/spa.h>
 #include <sys/nvpair.h>
 
@@ -38,6 +37,8 @@
 #include <libzfs.h>
 #include <libshare.h>
 
+#include <fm/libtopo.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -47,6 +48,13 @@ extern "C" {
 #endif
 #define        VERIFY  verify
 
+typedef struct libzfs_fru {
+       char *zf_device;
+       char *zf_fru;
+       struct libzfs_fru *zf_chain;
+       struct libzfs_fru *zf_next;
+} libzfs_fru_t;
+
 struct libzfs_handle {
        int libzfs_error;
        int libzfs_fd;
@@ -65,7 +73,13 @@ struct libzfs_handle {
        uint_t libzfs_shareflags;
        boolean_t libzfs_mnttab_enable;
        avl_tree_t libzfs_mnttab_cache;
+       int libzfs_pool_iter;
+       topo_hdl_t *libzfs_topo_hdl;
+       libzfs_fru_t **libzfs_fru_hash;
+       libzfs_fru_t *libzfs_fru_list;
+       char libzfs_chassis_id[256];
 };
+
 #define        ZFSSHARE_MISS   0x01    /* Didn't find entry in cache */
 
 struct zfs_handle {
@@ -77,6 +91,7 @@ struct zfs_handle {
        dmu_objset_stats_t zfs_dmustats;
        nvlist_t *zfs_props;
        nvlist_t *zfs_user_props;
+       nvlist_t *zfs_recvd_props;
        boolean_t zfs_mntcheck;
        char *zfs_mntopts;
        uint8_t *zfs_props_table;
@@ -112,7 +127,6 @@ typedef  enum {
  */
 typedef enum {
        SHARED_NOT_SHARED = 0x0,
-       SHARED_ISCSI = 0x1,
        SHARED_NFS = 0x2,
        SHARED_SMB = 0x4
 } zfs_share_type_t;
@@ -172,9 +186,6 @@ zfs_handle_t *make_dataset_handle(libzfs_handle_t *, const char *);
 
 int zpool_open_silent(libzfs_handle_t *, const char *, zpool_handle_t **);
 
-int zvol_create_link(libzfs_handle_t *, const char *);
-int zvol_remove_link(libzfs_handle_t *, const char *);
-int zpool_iter_zvol(zpool_handle_t *, int (*)(const char *, void *), void *);
 boolean_t zpool_name_valid(libzfs_handle_t *, boolean_t, const char *);
 
 void namespace_clear(libzfs_handle_t *);
@@ -189,6 +200,9 @@ extern int zfs_parse_options(char *, zfs_share_proto_t);
 
 extern int zfs_unshare_proto(zfs_handle_t *,
     const char *, zfs_share_proto_t *);
+
+extern void libzfs_fru_clear(libzfs_handle_t *, boolean_t);
+
 #ifdef __cplusplus
 }
 #endif
index ff438b3..4328d38 100644 (file)
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  *
  * Portions Copyright 2007 Ramprakash Jelari
@@ -116,32 +116,7 @@ changelist_prefix(prop_changelist_t *clp)
                if (getzoneid() == GLOBAL_ZONEID && cn->cn_zoned)
                        continue;
 
-               if (ZFS_IS_VOLUME(cn->cn_handle)) {
-                       switch (clp->cl_realprop) {
-                       case ZFS_PROP_NAME:
-                               /*
-                                * If this was a rename, unshare the zvol, and
-                                * remove the /dev/zvol links.
-                                */
-                               (void) zfs_unshare_iscsi(cn->cn_handle);
-
-                               if (zvol_remove_link(cn->cn_handle->zfs_hdl,
-                                   cn->cn_handle->zfs_name) != 0) {
-                                       ret = -1;
-                                       cn->cn_needpost = B_FALSE;
-                                       (void) zfs_share_iscsi(cn->cn_handle);
-                               }
-                               break;
-
-                       case ZFS_PROP_VOLSIZE:
-                               /*
-                                * If this was a change to the volume size, we
-                                * need to unshare and reshare the volume.
-                                */
-                               (void) zfs_unshare_iscsi(cn->cn_handle);
-                               break;
-                       }
-               } else {
+               if (!ZFS_IS_VOLUME(cn->cn_handle)) {
                        /*
                         * Do the property specific processing.
                         */
@@ -234,32 +209,8 @@ changelist_postfix(prop_changelist_t *clp)
 
                zfs_refresh_properties(cn->cn_handle);
 
-               if (ZFS_IS_VOLUME(cn->cn_handle)) {
-                       /*
-                        * If we're doing a rename, recreate the /dev/zvol
-                        * links.
-                        */
-                       if (clp->cl_realprop == ZFS_PROP_NAME &&
-                           zvol_create_link(cn->cn_handle->zfs_hdl,
-                           cn->cn_handle->zfs_name) != 0) {
-                               errors++;
-                       } else if (cn->cn_shared ||
-                           clp->cl_prop == ZFS_PROP_SHAREISCSI) {
-                               if (zfs_prop_get(cn->cn_handle,
-                                   ZFS_PROP_SHAREISCSI, shareopts,
-                                   sizeof (shareopts), NULL, NULL, 0,
-                                   B_FALSE) == 0 &&
-                                   strcmp(shareopts, "off") == 0) {
-                                       errors +=
-                                           zfs_unshare_iscsi(cn->cn_handle);
-                               } else {
-                                       errors +=
-                                           zfs_share_iscsi(cn->cn_handle);
-                               }
-                       }
-
+               if (ZFS_IS_VOLUME(cn->cn_handle))
                        continue;
-               }
 
                /*
                 * Remount if previously mounted or mountpoint was legacy,
@@ -658,8 +609,7 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags,
 
        if (clp->cl_prop != ZFS_PROP_MOUNTPOINT &&
            clp->cl_prop != ZFS_PROP_SHARENFS &&
-           clp->cl_prop != ZFS_PROP_SHARESMB &&
-           clp->cl_prop != ZFS_PROP_SHAREISCSI)
+           clp->cl_prop != ZFS_PROP_SHARESMB)
                return (clp);
 
        /*
index 94640d1..dc27238 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 /*
  * The pool configuration repository is stored in /etc/zfs/zpool.cache as a
  * single packed nvlist.  While it would be nice to just read in this
@@ -313,21 +311,33 @@ zpool_iter(libzfs_handle_t *hdl, zpool_iter_f func, void *data)
        zpool_handle_t *zhp;
        int ret;
 
-       if (namespace_reload(hdl) != 0)
+       /*
+        * If someone makes a recursive call to zpool_iter(), we want to avoid
+        * refreshing the namespace because that will invalidate the parent
+        * context.  We allow recursive calls, but simply re-use the same
+        * namespace AVL tree.
+        */
+       if (!hdl->libzfs_pool_iter && namespace_reload(hdl) != 0)
                return (-1);
 
+       hdl->libzfs_pool_iter++;
        for (cn = uu_avl_first(hdl->libzfs_ns_avl); cn != NULL;
            cn = uu_avl_next(hdl->libzfs_ns_avl, cn)) {
 
-               if (zpool_open_silent(hdl, cn->cn_name, &zhp) != 0)
+               if (zpool_open_silent(hdl, cn->cn_name, &zhp) != 0) {
+                       hdl->libzfs_pool_iter--;
                        return (-1);
+               }
 
                if (zhp == NULL)
                        continue;
 
-               if ((ret = func(zhp, data)) != 0)
+               if ((ret = func(zhp, data)) != 0) {
+                       hdl->libzfs_pool_iter--;
                        return (ret);
+               }
        }
+       hdl->libzfs_pool_iter--;
 
        return (0);
 }
index ab9ba6b..a3f5a7d 100644 (file)
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#include <assert.h>
 #include <ctype.h>
 #include <errno.h>
-#include <libdevinfo.h>
 #include <libintl.h>
 #include <math.h>
 #include <stdio.h>
@@ -39,7 +37,6 @@
 #include <fcntl.h>
 #include <sys/mntent.h>
 #include <sys/mount.h>
-#include <sys/avl.h>
 #include <priv.h>
 #include <pwd.h>
 #include <grp.h>
@@ -49,6 +46,7 @@
 #include <aclutils.h>
 #include <directory.h>
 
+#include <sys/dnode.h>
 #include <sys/spa.h>
 #include <sys/zap.h>
 #include <libzfs.h>
@@ -58,7 +56,6 @@
 #include "libzfs_impl.h"
 #include "zfs_deleg.h"
 
-static int zvol_create_link_common(libzfs_handle_t *, const char *, int);
 static int userquota_propname_decode(const char *propname, boolean_t zoned,
     zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp);
 
@@ -340,6 +337,44 @@ get_stats_ioctl(zfs_handle_t *zhp, zfs_cmd_t *zc)
        return (0);
 }
 
+/*
+ * Utility function to get the received properties of the given object.
+ */
+static int
+get_recvd_props_ioctl(zfs_handle_t *zhp)
+{
+       libzfs_handle_t *hdl = zhp->zfs_hdl;
+       nvlist_t *recvdprops;
+       zfs_cmd_t zc = { 0 };
+       int err;
+
+       if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0)
+               return (-1);
+
+       (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+
+       while (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_RECVD_PROPS, &zc) != 0) {
+               if (errno == ENOMEM) {
+                       if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
+                               return (-1);
+                       }
+               } else {
+                       zcmd_free_nvlists(&zc);
+                       return (-1);
+               }
+       }
+
+       err = zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &recvdprops);
+       zcmd_free_nvlists(&zc);
+       if (err != 0)
+               return (-1);
+
+       nvlist_free(zhp->zfs_recvd_props);
+       zhp->zfs_recvd_props = recvdprops;
+
+       return (0);
+}
+
 static int
 put_stats_zhdl(zfs_handle_t *zhp, zfs_cmd_t *zc)
 {
@@ -401,70 +436,8 @@ zfs_refresh_properties(zfs_handle_t *zhp)
 static int
 make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc)
 {
-       char *logstr;
-       libzfs_handle_t *hdl = zhp->zfs_hdl;
-
-       /*
-        * Preserve history log string.
-        * any changes performed here will be
-        * logged as an internal event.
-        */
-       logstr = zhp->zfs_hdl->libzfs_log_str;
-       zhp->zfs_hdl->libzfs_log_str = NULL;
-
-top:
-       if (put_stats_zhdl(zhp, zc) != 0) {
-               zhp->zfs_hdl->libzfs_log_str = logstr;
+       if (put_stats_zhdl(zhp, zc) != 0)
                return (-1);
-       }
-
-
-       if (zhp->zfs_dmustats.dds_inconsistent) {
-               zfs_cmd_t zc2 = { 0 };
-
-               /*
-                * If it is dds_inconsistent, then we've caught it in
-                * the middle of a 'zfs receive' or 'zfs destroy', and
-                * it is inconsistent from the ZPL's point of view, so
-                * can't be mounted.  However, it could also be that we
-                * have crashed in the middle of one of those
-                * operations, in which case we need to get rid of the
-                * inconsistent state.  We do that by either rolling
-                * back to the previous snapshot (which will fail if
-                * there is none), or destroying the filesystem.  Note
-                * that if we are still in the middle of an active
-                * 'receive' or 'destroy', then the rollback and destroy
-                * will fail with EBUSY and we will drive on as usual.
-                */
-
-               (void) strlcpy(zc2.zc_name, zhp->zfs_name,
-                   sizeof (zc2.zc_name));
-
-               if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) {
-                       (void) zvol_remove_link(hdl, zhp->zfs_name);
-                       zc2.zc_objset_type = DMU_OST_ZVOL;
-               } else {
-                       zc2.zc_objset_type = DMU_OST_ZFS;
-               }
-
-               /*
-                * If we can successfully destroy it, pretend that it
-                * never existed.
-                */
-               if (ioctl(hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc2) == 0) {
-                       zhp->zfs_hdl->libzfs_log_str = logstr;
-                       errno = ENOENT;
-                       return (-1);
-               }
-               /* If we can successfully roll it back, reset the stats */
-               if (ioctl(hdl->libzfs_fd, ZFS_IOC_ROLLBACK, &zc2) == 0) {
-                       if (get_stats_ioctl(zhp, zc) != 0) {
-                               zhp->zfs_hdl->libzfs_log_str = logstr;
-                               return (-1);
-                       }
-                       goto top;
-               }
-       }
 
        /*
         * We've managed to open the dataset and gather statistics.  Determine
@@ -486,8 +459,9 @@ top:
        else
                abort();        /* we should never see any other types */
 
-       zhp->zfs_hdl->libzfs_log_str = logstr;
-       zhp->zpool_hdl = zpool_handle(zhp);
+       if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL)
+               return (-1);
+
        return (0);
 }
 
@@ -589,6 +563,7 @@ zfs_close(zfs_handle_t *zhp)
                free(zhp->zfs_mntopts);
        nvlist_free(zhp->zfs_props);
        nvlist_free(zhp->zfs_user_props);
+       nvlist_free(zhp->zfs_recvd_props);
        free(zhp);
 }
 
@@ -882,9 +857,14 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl,
                                goto error;
                        }
 
+                       /*
+                        * Encode the prop name as
+                        * userquota@<hex-rid>-domain, to make it easy
+                        * for the kernel to decode.
+                        */
                        (void) snprintf(newpropname, sizeof (newpropname),
-                           "%s%s", zfs_userquota_prop_prefixes[uqtype],
-                           domain);
+                           "%s%llx-%s", zfs_userquota_prop_prefixes[uqtype],
+                           (longlong_t)rid, domain);
                        valary[0] = uqtype;
                        valary[1] = rid;
                        valary[2] = intval;
@@ -960,19 +940,60 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl,
                        }
                        break;
 
-               case ZFS_PROP_SHAREISCSI:
-                       if (strcmp(strval, "off") != 0 &&
-                           strcmp(strval, "on") != 0 &&
-                           strcmp(strval, "type=disk") != 0) {
-                               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-                                   "'%s' must be 'on', 'off', or 'type=disk'"),
-                                   propname);
-                               (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
-                               goto error;
+               case ZFS_PROP_MLSLABEL:
+               {
+                       /*
+                        * Verify the mlslabel string and convert to
+                        * internal hex label string.
+                        */
+
+                       m_label_t *new_sl;
+                       char *hex = NULL;       /* internal label string */
+
+                       /* Default value is already OK. */
+                       if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0)
+                               break;
+
+                       /* Verify the label can be converted to binary form */
+                       if (((new_sl = m_label_alloc(MAC_LABEL)) == NULL) ||
+                           (str_to_label(strval, &new_sl, MAC_LABEL,
+                           L_NO_CORRECTION, NULL) == -1)) {
+                               goto badlabel;
                        }
 
+                       /* Now translate to hex internal label string */
+                       if (label_to_str(new_sl, &hex, M_INTERNAL,
+                           DEF_NAMES) != 0) {
+                               if (hex)
+                                       free(hex);
+                               goto badlabel;
+                       }
+                       m_label_free(new_sl);
+
+                       /* If string is already in internal form, we're done. */
+                       if (strcmp(strval, hex) == 0) {
+                               free(hex);
+                               break;
+                       }
+
+                       /* Replace the label string with the internal form. */
+                       (void) nvlist_remove(ret, zfs_prop_to_name(prop),
+                           DATA_TYPE_STRING);
+                       verify(nvlist_add_string(ret, zfs_prop_to_name(prop),
+                           hex) == 0);
+                       free(hex);
+
                        break;
 
+badlabel:
+                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                           "invalid mlslabel '%s'"), strval);
+                       (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+                       m_label_free(new_sl);   /* OK if null */
+                       goto error;
+
+               }
+
                case ZFS_PROP_MOUNTPOINT:
                {
                        namecheck_err_t why;
@@ -1226,6 +1247,90 @@ error:
        return (NULL);
 }
 
+void
+zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err,
+    char *errbuf)
+{
+       switch (err) {
+
+       case ENOSPC:
+               /*
+                * For quotas and reservations, ENOSPC indicates
+                * something different; setting a quota or reservation
+                * doesn't use any disk space.
+                */
+               switch (prop) {
+               case ZFS_PROP_QUOTA:
+               case ZFS_PROP_REFQUOTA:
+                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                           "size is less than current used or "
+                           "reserved space"));
+                       (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf);
+                       break;
+
+               case ZFS_PROP_RESERVATION:
+               case ZFS_PROP_REFRESERVATION:
+                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                           "size is greater than available space"));
+                       (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf);
+                       break;
+
+               default:
+                       (void) zfs_standard_error(hdl, err, errbuf);
+                       break;
+               }
+               break;
+
+       case EBUSY:
+               (void) zfs_standard_error(hdl, EBUSY, errbuf);
+               break;
+
+       case EROFS:
+               (void) zfs_error(hdl, EZFS_DSREADONLY, errbuf);
+               break;
+
+       case ENOTSUP:
+               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                   "pool and or dataset must be upgraded to set this "
+                   "property or value"));
+               (void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
+               break;
+
+       case ERANGE:
+               if (prop == ZFS_PROP_COMPRESSION) {
+                       (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                           "property setting is not allowed on "
+                           "bootable datasets"));
+                       (void) zfs_error(hdl, EZFS_NOTSUP, errbuf);
+               } else {
+                       (void) zfs_standard_error(hdl, err, errbuf);
+               }
+               break;
+
+       case EINVAL:
+               if (prop == ZPROP_INVAL) {
+                       (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+               } else {
+                       (void) zfs_standard_error(hdl, err, errbuf);
+               }
+               break;
+
+       case EOVERFLOW:
+               /*
+                * This platform can't address a volume this big.
+                */
+#ifdef _ILP32
+               if (prop == ZFS_PROP_VOLSIZE) {
+                       (void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf);
+                       break;
+               }
+#endif
+               /* FALLTHROUGH */
+       default:
+               (void) zfs_standard_error(hdl, err, errbuf);
+       }
+}
+
 /*
  * Given a property name and value, set the property for the given dataset.
  */
@@ -1294,79 +1399,7 @@ zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval)
        ret = zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc);
 
        if (ret != 0) {
-               switch (errno) {
-
-               case ENOSPC:
-                       /*
-                        * For quotas and reservations, ENOSPC indicates
-                        * something different; setting a quota or reservation
-                        * doesn't use any disk space.
-                        */
-                       switch (prop) {
-                       case ZFS_PROP_QUOTA:
-                       case ZFS_PROP_REFQUOTA:
-                               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-                                   "size is less than current used or "
-                                   "reserved space"));
-                               (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf);
-                               break;
-
-                       case ZFS_PROP_RESERVATION:
-                       case ZFS_PROP_REFRESERVATION:
-                               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-                                   "size is greater than available space"));
-                               (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf);
-                               break;
-
-                       default:
-                               (void) zfs_standard_error(hdl, errno, errbuf);
-                               break;
-                       }
-                       break;
-
-               case EBUSY:
-                       if (prop == ZFS_PROP_VOLBLOCKSIZE)
-                               (void) zfs_error(hdl, EZFS_VOLHASDATA, errbuf);
-                       else
-                               (void) zfs_standard_error(hdl, EBUSY, errbuf);
-                       break;
-
-               case EROFS:
-                       (void) zfs_error(hdl, EZFS_DSREADONLY, errbuf);
-                       break;
-
-               case ENOTSUP:
-                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-                           "pool and or dataset must be upgraded to set this "
-                           "property or value"));
-                       (void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
-                       break;
-
-               case ERANGE:
-                       if (prop == ZFS_PROP_COMPRESSION) {
-                               (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-                                   "property setting is not allowed on "
-                                   "bootable datasets"));
-                               (void) zfs_error(hdl, EZFS_NOTSUP, errbuf);
-                       } else {
-                               (void) zfs_standard_error(hdl, errno, errbuf);
-                       }
-                       break;
-
-               case EOVERFLOW:
-                       /*
-                        * This platform can't address a volume this big.
-                        */
-#ifdef _ILP32
-                       if (prop == ZFS_PROP_VOLSIZE) {
-                               (void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf);
-                               break;
-                       }
-#endif
-                       /* FALLTHROUGH */
-               default:
-                       (void) zfs_standard_error(hdl, errno, errbuf);
-               }
+               zfs_setprop_error(hdl, prop, errno, errbuf);
        } else {
                if (do_prefix)
                        ret = changelist_postfix(cl);
@@ -1388,10 +1421,11 @@ error:
 }
 
 /*
- * Given a property, inherit the value from the parent dataset.
+ * Given a property, inherit the value from the parent dataset, or if received
+ * is TRUE, revert to the received value, if any.
  */
 int
-zfs_prop_inherit(zfs_handle_t *zhp, const char *propname)
+zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received)
 {
        zfs_cmd_t zc = { 0 };
        int ret;
@@ -1403,6 +1437,7 @@ zfs_prop_inherit(zfs_handle_t *zhp, const char *propname)
        (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
            "cannot inherit %s for '%s'"), propname, zhp->zfs_name);
 
+       zc.zc_cookie = received;
        if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL) {
                /*
                 * For user properties, the amount of work we have to do is very
@@ -1429,7 +1464,7 @@ zfs_prop_inherit(zfs_handle_t *zhp, const char *propname)
        if (zfs_prop_readonly(prop))
                return (zfs_error(hdl, EZFS_PROPREADONLY, errbuf));
 
-       if (!zfs_prop_inheritable(prop))
+       if (!zfs_prop_inheritable(prop) && !received)
                return (zfs_error(hdl, EZFS_PROPNONINHERIT, errbuf));
 
        /*
@@ -1534,6 +1569,26 @@ getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source)
        return (value);
 }
 
+static boolean_t
+zfs_is_recvd_props_mode(zfs_handle_t *zhp)
+{
+       return (zhp->zfs_props == zhp->zfs_recvd_props);
+}
+
+static void
+zfs_set_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie)
+{
+       *cookie = (uint64_t)(uintptr_t)zhp->zfs_props;
+       zhp->zfs_props = zhp->zfs_recvd_props;
+}
+
+static void
+zfs_unset_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie)
+{
+       zhp->zfs_props = (nvlist_t *)(uintptr_t)*cookie;
+       *cookie = 0;
+}
+
 /*
  * Internal function for getting a numeric property.  Both zfs_prop_get() and
  * zfs_prop_get_int() are built using this interface.
@@ -1552,6 +1607,7 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src,
        struct mnttab mnt;
        char *mntopt_on = NULL;
        char *mntopt_off = NULL;
+       boolean_t received = zfs_is_recvd_props_mode(zhp);
 
        *source = NULL;
 
@@ -1627,6 +1683,9 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src,
        case ZFS_PROP_NBMAND:
                *val = getprop_uint64(zhp, prop, source);
 
+               if (received)
+                       break;
+
                if (hasmntopt(&mnt, mntopt_on) && !*val) {
                        *val = B_TRUE;
                        if (src)
@@ -1639,22 +1698,17 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src,
                break;
 
        case ZFS_PROP_CANMOUNT:
-               *val = getprop_uint64(zhp, prop, source);
-               if (*val != ZFS_CANMOUNT_ON)
-                       *source = zhp->zfs_name;
-               else
-                       *source = "";   /* default */
-               break;
-
+       case ZFS_PROP_VOLSIZE:
        case ZFS_PROP_QUOTA:
        case ZFS_PROP_REFQUOTA:
        case ZFS_PROP_RESERVATION:
        case ZFS_PROP_REFRESERVATION:
                *val = getprop_uint64(zhp, prop, source);
-               if (*val == 0)
-                       *source = "";   /* default */
-               else
+
+               if (*source == NULL) {
+                       /* not default, must be local */
                        *source = zhp->zfs_name;
+               }
                break;
 
        case ZFS_PROP_MOUNTED:
@@ -1696,11 +1750,11 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src,
                        /*
                         * If we tried to use a default value for a
                         * readonly property, it means that it was not
-                        * present; return an error.
+                        * present.
                         */
                        if (zfs_prop_readonly(prop) &&
-                           *source && (*source)[0] == '\0') {
-                               return (-1);
+                           *source != NULL && (*source)[0] == '\0') {
+                               *source = NULL;
                        }
                        break;
 
@@ -1730,6 +1784,8 @@ get_source(zfs_handle_t *zhp, zprop_source_t *srctype, char *source,
                *srctype = ZPROP_SRC_NONE;
        } else if (source[0] == '\0') {
                *srctype = ZPROP_SRC_DEFAULT;
+       } else if (strstr(source, ZPROP_SOURCE_VAL_RECVD) != NULL) {
+               *srctype = ZPROP_SRC_RECEIVED;
        } else {
                if (strcmp(source, zhp->zfs_name) == 0) {
                        *srctype = ZPROP_SRC_LOCAL;
@@ -1741,6 +1797,43 @@ get_source(zfs_handle_t *zhp, zprop_source_t *srctype, char *source,
 
 }
 
+int
+zfs_prop_get_recvd(zfs_handle_t *zhp, const char *propname, char *propbuf,
+    size_t proplen, boolean_t literal)
+{
+       zfs_prop_t prop;
+       int err = 0;
+
+       if (zhp->zfs_recvd_props == NULL)
+               if (get_recvd_props_ioctl(zhp) != 0)
+                       return (-1);
+
+       prop = zfs_name_to_prop(propname);
+
+       if (prop != ZPROP_INVAL) {
+               uint64_t cookie;
+               if (!nvlist_exists(zhp->zfs_recvd_props, propname))
+                       return (-1);
+               zfs_set_recvd_props_mode(zhp, &cookie);
+               err = zfs_prop_get(zhp, prop, propbuf, proplen,
+                   NULL, NULL, 0, literal);
+               zfs_unset_recvd_props_mode(zhp, &cookie);
+       } else if (zfs_prop_userquota(propname)) {
+               return (-1);
+       } else {
+               nvlist_t *propval;
+               char *recvdval;
+               if (nvlist_lookup_nvlist(zhp->zfs_recvd_props,
+                   propname, &propval) != 0)
+                       return (-1);
+               verify(nvlist_lookup_string(propval, ZPROP_VALUE,
+                   &recvdval) == 0);
+               (void) strlcpy(propbuf, recvdval, proplen);
+       }
+
+       return (err == 0 ? 0 : -1);
+}
+
 /*
  * Retrieve a property from the given object.  If 'literal' is specified, then
  * numbers are left as exact values.  Otherwise, numbers are converted to a
@@ -1756,6 +1849,7 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
        uint64_t val;
        char *str;
        const char *strval;
+       boolean_t received = zfs_is_recvd_props_mode(zhp);
 
        /*
         * Check to see if this property applies to our object
@@ -1763,6 +1857,9 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
        if (!zfs_prop_valid_for_type(prop, zhp->zfs_type))
                return (-1);
 
+       if (received && zfs_prop_readonly(prop))
+               return (-1);
+
        if (src)
                *src = ZPROP_SRC_NONE;
 
@@ -1802,10 +1899,22 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
                if (str[0] == '/') {
                        char buf[MAXPATHLEN];
                        char *root = buf;
-                       const char *relpath = zhp->zfs_name + strlen(source);
+                       const char *relpath;
 
-                       if (relpath[0] == '/')
-                               relpath++;
+                       /*
+                        * If we inherit the mountpoint, even from a dataset
+                        * with a received value, the source will be the path of
+                        * the dataset we inherit from. If source is
+                        * ZPROP_SOURCE_VAL_RECVD, the received value is not
+                        * inherited.
+                        */
+                       if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) {
+                               relpath = "";
+                       } else {
+                               relpath = zhp->zfs_name + strlen(source);
+                               if (relpath[0] == '/')
+                                       relpath++;
+                       }
 
                        if ((zpool_get_prop(zhp->zpool_hdl,
                            ZPOOL_PROP_ALTROOT, buf, MAXPATHLEN, NULL)) ||
@@ -1884,8 +1993,9 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
        case ZFS_PROP_COMPRESSRATIO:
                if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
                        return (-1);
-               (void) snprintf(propbuf, proplen, "%lld.%02lldx", (longlong_t)
-                   val / 100, (longlong_t)val % 100);
+               (void) snprintf(propbuf, proplen, "%llu.%02llux",
+                   (u_longlong_t)(val / 100),
+                   (u_longlong_t)(val % 100));
                break;
 
        case ZFS_PROP_TYPE:
@@ -1930,6 +2040,44 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
                (void) strlcpy(propbuf, zhp->zfs_name, proplen);
                break;
 
+       case ZFS_PROP_MLSLABEL:
+               {
+                       m_label_t *new_sl = NULL;
+                       char *ascii = NULL;     /* human readable label */
+
+                       (void) strlcpy(propbuf,
+                           getprop_string(zhp, prop, &source), proplen);
+
+                       if (literal || (strcasecmp(propbuf,
+                           ZFS_MLSLABEL_DEFAULT) == 0))
+                               break;
+
+                       /*
+                        * Try to translate the internal hex string to
+                        * human-readable output.  If there are any
+                        * problems just use the hex string.
+                        */
+
+                       if (str_to_label(propbuf, &new_sl, MAC_LABEL,
+                           L_NO_CORRECTION, NULL) == -1) {
+                               m_label_free(new_sl);
+                               break;
+                       }
+
+                       if (label_to_str(new_sl, &ascii, M_LABEL,
+                           DEF_NAMES) != 0) {
+                               if (ascii)
+                                       free(ascii);
+                               m_label_free(new_sl);
+                               break;
+                       }
+                       m_label_free(new_sl);
+
+                       (void) strlcpy(propbuf, ascii, proplen);
+                       free(ascii);
+               }
+               break;
+
        default:
                switch (zfs_prop_get_type(prop)) {
                case PROP_TYPE_NUMBER:
@@ -2371,6 +2519,27 @@ zfs_iter_children(zfs_handle_t *zhp, zfs_iter_f func, void *data)
 }
 
 /*
+ * Is one dataset name a child dataset of another?
+ *
+ * Needs to handle these cases:
+ * Dataset 1   "a/foo"         "a/foo"         "a/foo"         "a/foo"
+ * Dataset 2   "a/fo"          "a/foobar"      "a/bar/baz"     "a/foo/bar"
+ * Descendant? No.             No.             No.             Yes.
+ */
+static boolean_t
+is_descendant(const char *ds1, const char *ds2)
+{
+       size_t d1len = strlen(ds1);
+
+       /* ds2 can't be a descendant if it's smaller */
+       if (strlen(ds2) < d1len)
+               return (B_FALSE);
+
+       /* otherwise, compare strings and verify that there's a '/' char */
+       return (ds2[d1len] == '/' && (strncmp(ds1, ds2, d1len) == 0));
+}
+
+/*
  * Given a complete name, return just the portion that refers to the parent.
  * Can return NULL if this is a pool.
  */
@@ -2405,6 +2574,7 @@ check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned,
        char *slash;
        zfs_handle_t *zhp;
        char errbuf[1024];
+       uint64_t is_zoned;
 
        (void) snprintf(errbuf, sizeof (errbuf),
            dgettext(TEXT_DOMAIN, "cannot create '%s'"), path);
@@ -2447,9 +2617,12 @@ check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned,
                        return (zfs_standard_error(hdl, errno, errbuf));
        }
 
-       *zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
+       is_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
+       if (zoned != NULL)
+               *zoned = is_zoned;
+
        /* we are in a non-global zone, but parent is in the global zone */
-       if (getzoneid() != GLOBAL_ZONEID && !(*zoned)) {
+       if (getzoneid() != GLOBAL_ZONEID && !is_zoned) {
                (void) zfs_standard_error(hdl, EPERM, errbuf);
                zfs_close(zhp);
                return (-1);
@@ -2581,11 +2754,10 @@ int
 zfs_create_ancestors(libzfs_handle_t *hdl, const char *path)
 {
        int prefix;
-       uint64_t zoned;
        char *path_copy;
        int rc;
 
-       if (check_parents(hdl, path, &zoned, B_TRUE, &prefix) != 0)
+       if (check_parents(hdl, path, NULL, B_TRUE, &prefix) != 0)
                return (-1);
 
        if ((path_copy = strdup(path)) != NULL) {
@@ -2699,18 +2871,6 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
        /* create the dataset */
        ret = zfs_ioctl(hdl, ZFS_IOC_CREATE, &zc);
 
-       if (ret == 0 && type == ZFS_TYPE_VOLUME) {
-               ret = zvol_create_link(hdl, path);
-               if (ret) {
-                       (void) zfs_standard_error(hdl, errno,
-                           dgettext(TEXT_DOMAIN,
-                           "Volume successfully created, but device links "
-                           "were not created"));
-                       zcmd_free_nvlists(&zc);
-                       return (-1);
-               }
-       }
-
        zcmd_free_nvlists(&zc);
 
        /* check for failure */
@@ -2773,18 +2933,6 @@ zfs_destroy(zfs_handle_t *zhp, boolean_t defer)
        (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
        if (ZFS_IS_VOLUME(zhp)) {
-               /*
-                * If user doesn't have permissions to unshare volume, then
-                * abort the request.  This would only happen for a
-                * non-privileged user.
-                */
-               if (zfs_unshare_iscsi(zhp) != 0) {
-                       return (-1);
-               }
-
-               if (zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name) != 0)
-                       return (-1);
-
                zc.zc_objset_type = DMU_OST_ZVOL;
        } else {
                zc.zc_objset_type = DMU_OST_ZFS;
@@ -2809,13 +2957,13 @@ struct destroydata {
 };
 
 static int
-zfs_remove_link_cb(zfs_handle_t *zhp, void *arg)
+zfs_check_snap_cb(zfs_handle_t *zhp, void *arg)
 {
        struct destroydata *dd = arg;
        zfs_handle_t *szhp;
        char name[ZFS_MAXNAMELEN];
        boolean_t closezhp = dd->closezhp;
-       int rv;
+       int rv = 0;
 
        (void) strlcpy(name, zhp->zfs_name, sizeof (name));
        (void) strlcat(name, "@", sizeof (name));
@@ -2827,17 +2975,9 @@ zfs_remove_link_cb(zfs_handle_t *zhp, void *arg)
                zfs_close(szhp);
        }
 
-       if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
-               (void) zvol_remove_link(zhp->zfs_hdl, name);
-               /*
-                * NB: this is simply a best-effort.  We don't want to
-                * return an error, because then we wouldn't visit all
-                * the volumes.
-                */
-       }
-
        dd->closezhp = B_TRUE;
-       rv = zfs_iter_filesystems(zhp, zfs_remove_link_cb, arg);
+       if (!dd->gotone)
+               rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, arg);
        if (closezhp)
                zfs_close(zhp);
        return (rv);
@@ -2854,7 +2994,7 @@ zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer)
        struct destroydata dd = { 0 };
 
        dd.snapname = snapname;
-       (void) zfs_remove_link_cb(zhp, &dd);
+       (void) zfs_check_snap_cb(zhp, &dd);
 
        if (!dd.gotone) {
                return (zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT,
@@ -2972,70 +3112,11 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props)
                        return (zfs_standard_error(zhp->zfs_hdl, errno,
                            errbuf));
                }
-       } else if (ZFS_IS_VOLUME(zhp)) {
-               ret = zvol_create_link(zhp->zfs_hdl, target);
        }
 
        return (ret);
 }
 
-typedef struct promote_data {
-       char cb_mountpoint[MAXPATHLEN];
-       const char *cb_target;
-       const char *cb_errbuf;
-       uint64_t cb_pivot_txg;
-} promote_data_t;
-
-static int
-promote_snap_cb(zfs_handle_t *zhp, void *data)
-{
-       promote_data_t *pd = data;
-       zfs_handle_t *szhp;
-       char snapname[MAXPATHLEN];
-       int rv = 0;
-
-       /* We don't care about snapshots after the pivot point */
-       if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > pd->cb_pivot_txg) {
-               zfs_close(zhp);
-               return (0);
-       }
-
-       /* Remove the device link if it's a zvol. */
-       if (ZFS_IS_VOLUME(zhp))
-               (void) zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name);
-
-       /* Check for conflicting names */
-       (void) strlcpy(snapname, pd->cb_target, sizeof (snapname));
-       (void) strlcat(snapname, strchr(zhp->zfs_name, '@'), sizeof (snapname));
-       szhp = make_dataset_handle(zhp->zfs_hdl, snapname);
-       if (szhp != NULL) {
-               zfs_close(szhp);
-               zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
-                   "snapshot name '%s' from origin \n"
-                   "conflicts with '%s' from target"),
-                   zhp->zfs_name, snapname);
-               rv = zfs_error(zhp->zfs_hdl, EZFS_EXISTS, pd->cb_errbuf);
-       }
-       zfs_close(zhp);
-       return (rv);
-}
-
-static int
-promote_snap_done_cb(zfs_handle_t *zhp, void *data)
-{
-       promote_data_t *pd = data;
-
-       /* We don't care about snapshots after the pivot point */
-       if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) <= pd->cb_pivot_txg) {
-               /* Create the device link if it's a zvol. */
-               if (ZFS_IS_VOLUME(zhp))
-                       (void) zvol_create_link(zhp->zfs_hdl, zhp->zfs_name);
-       }
-
-       zfs_close(zhp);
-       return (0);
-}
-
 /*
  * Promotes the given clone fs to be the clone parent.
  */
@@ -3045,10 +3126,7 @@ zfs_promote(zfs_handle_t *zhp)
        libzfs_handle_t *hdl = zhp->zfs_hdl;
        zfs_cmd_t zc = { 0 };
        char parent[MAXPATHLEN];
-       char *cp;
        int ret;
-       zfs_handle_t *pzhp;
-       promote_data_t pd;
        char errbuf[1024];
 
        (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
@@ -3066,29 +3144,7 @@ zfs_promote(zfs_handle_t *zhp)
                    "not a cloned filesystem"));
                return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
        }
-       cp = strchr(parent, '@');
-       *cp = '\0';
-
-       /* Walk the snapshots we will be moving */
-       pzhp = zfs_open(hdl, zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT);
-       if (pzhp == NULL)
-               return (-1);
-       pd.cb_pivot_txg = zfs_prop_get_int(pzhp, ZFS_PROP_CREATETXG);
-       zfs_close(pzhp);
-       pd.cb_target = zhp->zfs_name;
-       pd.cb_errbuf = errbuf;
-       pzhp = zfs_open(hdl, parent, ZFS_TYPE_DATASET);
-       if (pzhp == NULL)
-               return (-1);
-       (void) zfs_prop_get(pzhp, ZFS_PROP_MOUNTPOINT, pd.cb_mountpoint,
-           sizeof (pd.cb_mountpoint), NULL, NULL, 0, FALSE);
-       ret = zfs_iter_snapshots(pzhp, promote_snap_cb, &pd);
-       if (ret != 0) {
-               zfs_close(pzhp);
-               return (-1);
-       }
 
-       /* issue the ioctl */
        (void) strlcpy(zc.zc_value, zhp->zfs_dmustats.dds_origin,
            sizeof (zc.zc_value));
        (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
@@ -3097,62 +3153,18 @@ zfs_promote(zfs_handle_t *zhp)
        if (ret != 0) {
                int save_errno = errno;
 
-               (void) zfs_iter_snapshots(pzhp, promote_snap_done_cb, &pd);
-               zfs_close(pzhp);
-
                switch (save_errno) {
                case EEXIST:
-                       /*
-                        * There is a conflicting snapshot name.  We
-                        * should have caught this above, but they could
-                        * have renamed something in the mean time.
-                        */
+                       /* There is a conflicting snapshot name. */
                        zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-                           "conflicting snapshot name from parent '%s'"),
-                           parent);
+                           "conflicting snapshot '%s' from parent '%s'"),
+                           zc.zc_string, parent);
                        return (zfs_error(hdl, EZFS_EXISTS, errbuf));
 
                default:
                        return (zfs_standard_error(hdl, save_errno, errbuf));
                }
-       } else {
-               (void) zfs_iter_snapshots(zhp, promote_snap_done_cb, &pd);
-       }
-
-       zfs_close(pzhp);
-       return (ret);
-}
-
-struct createdata {
-       const char *cd_snapname;
-       int cd_ifexists;
-};
-
-static int
-zfs_create_link_cb(zfs_handle_t *zhp, void *arg)
-{
-       struct createdata *cd = arg;
-       int ret;
-
-       if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
-               char name[MAXPATHLEN];
-
-               (void) strlcpy(name, zhp->zfs_name, sizeof (name));
-               (void) strlcat(name, "@", sizeof (name));
-               (void) strlcat(name, cd->cd_snapname, sizeof (name));
-               (void) zvol_create_link_common(zhp->zfs_hdl, name,
-                   cd->cd_ifexists);
-               /*
-                * NB: this is simply a best-effort.  We don't want to
-                * return an error, because then we wouldn't visit all
-                * the volumes.
-                */
        }
-
-       ret = zfs_iter_filesystems(zhp, zfs_create_link_cb, cd);
-
-       zfs_close(zhp);
-
        return (ret);
 }
 
@@ -3216,31 +3228,11 @@ zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive,
         * if it was recursive, the one that actually failed will be in
         * zc.zc_name.
         */
-       if (ret != 0)
+       if (ret != 0) {
                (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
                    "cannot create snapshot '%s@%s'"), zc.zc_name, zc.zc_value);
-
-       if (ret == 0 && recursive) {
-               struct createdata cd;
-
-               cd.cd_snapname = delim + 1;
-               cd.cd_ifexists = B_FALSE;
-               (void) zfs_iter_filesystems(zhp, zfs_create_link_cb, &cd);
-       }
-       if (ret == 0 && zhp->zfs_type == ZFS_TYPE_VOLUME) {
-               ret = zvol_create_link(zhp->zfs_hdl, path);
-               if (ret != 0) {
-                       (void) zfs_standard_error(hdl, errno,
-                           dgettext(TEXT_DOMAIN,
-                           "Volume successfully snapshotted, but device links "
-                           "were not created"));
-                       zfs_close(zhp);
-                       return (-1);
-               }
-       }
-
-       if (ret != 0)
                (void) zfs_standard_error(hdl, errno, errbuf);
+       }
 
        zfs_close(zhp);
 
@@ -3343,8 +3335,6 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force)
         */
 
        if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
-               if (zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name) != 0)
-                       return (-1);
                if (zfs_which_resv_prop(zhp, &resv_prop) < 0)
                        return (-1);
                old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
@@ -3382,10 +3372,6 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force)
         */
        if ((zhp->zfs_type == ZFS_TYPE_VOLUME) &&
            (zhp = make_dataset_handle(zhp->zfs_hdl, zhp->zfs_name))) {
-               if (err = zvol_create_link(zhp->zfs_hdl, zhp->zfs_name)) {
-                       zfs_close(zhp);
-                       return (err);
-               }
                if (restore_resv) {
                        new_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
                        if (old_volsize != new_volsize)
@@ -3500,14 +3486,11 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive)
 
                if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE))
                        return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
-               uint64_t unused;
 
                /* validate parents */
-               if (check_parents(hdl, target, &unused, B_FALSE, NULL) != 0)
+               if (check_parents(hdl, target, NULL, B_FALSE, NULL) != 0)
                        return (-1);
 
-               (void) parent_name(target, parent, sizeof (parent));
-
                /* make sure we're in the same pool */
                verify((delim = strchr(target, '/')) != NULL);
                if (strncmp(zhp->zfs_name, target, delim - target) != 0 ||
@@ -3518,10 +3501,9 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive)
                }
 
                /* new name cannot be a child of the current dataset name */
-               if (strncmp(parent, zhp->zfs_name,
-                   strlen(zhp->zfs_name)) == 0) {
+               if (is_descendant(zhp->zfs_name, target)) {
                        zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-                           "New dataset name cannot be a descendent of "
+                           "New dataset name cannot be a descendant of "
                            "current dataset name"));
                        return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
                }
@@ -3538,7 +3520,6 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive)
        }
 
        if (recursive) {
-               struct destroydata dd;
 
                parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name);
                if (parentname == NULL) {
@@ -3553,15 +3534,6 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive)
                        goto error;
                }
 
-               dd.snapname = delim + 1;
-               dd.gotone = B_FALSE;
-               dd.closezhp = B_TRUE;
-
-               /* We remove any zvol links prior to renaming them */
-               ret = zfs_iter_filesystems(zhrp, zfs_remove_link_cb, &dd);
-               if (ret) {
-                       goto error;
-               }
        } else {
                if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0)) == NULL)
                        return (-1);
@@ -3609,27 +3581,10 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive)
                 * On failure, we still want to remount any filesystems that
                 * were previously mounted, so we don't alter the system state.
                 */
-               if (recursive) {
-                       struct createdata cd;
-
-                       /* only create links for datasets that had existed */
-                       cd.cd_snapname = delim + 1;
-                       cd.cd_ifexists = B_TRUE;
-                       (void) zfs_iter_filesystems(zhrp, zfs_create_link_cb,
-                           &cd);
-               } else {
+               if (!recursive)
                        (void) changelist_postfix(cl);
-               }
        } else {
-               if (recursive) {
-                       struct createdata cd;
-
-                       /* only create links for datasets that had existed */
-                       cd.cd_snapname = strchr(target, '@') + 1;
-                       cd.cd_ifexists = B_TRUE;
-                       ret = zfs_iter_filesystems(zhrp, zfs_create_link_cb,
-                           &cd);
-               } else {
+               if (!recursive) {
                        changelist_rename(cl, zfs_get_name(zhp), target);
                        ret = changelist_postfix(cl);
                }
@@ -3648,143 +3603,19 @@ error:
        return (ret);
 }
 
-/*
- * Given a zvol dataset, issue the ioctl to create the appropriate minor node,
- * poke devfsadm to create the /dev link, and then wait for the link to appear.
- */
-int
-zvol_create_link(libzfs_handle_t *hdl, const char *dataset)
-{
-       return (zvol_create_link_common(hdl, dataset, B_FALSE));
-}
-
-static int
-zvol_create_link_common(libzfs_handle_t *hdl, const char *dataset, int ifexists)
-{
-       zfs_cmd_t zc = { 0 };
-       di_devlink_handle_t dhdl;
-       priv_set_t *priv_effective;
-       int privileged;
-
-       (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
-
-       /*
-        * Issue the appropriate ioctl.
-        */
-       if (ioctl(hdl->libzfs_fd, ZFS_IOC_CREATE_MINOR, &zc) != 0) {
-               switch (errno) {
-               case EEXIST:
-                       /*
-                        * Silently ignore the case where the link already
-                        * exists.  This allows 'zfs volinit' to be run multiple
-                        * times without errors.
-                        */
-                       return (0);
-
-               case ENOENT:
-                       /*
-                        * Dataset does not exist in the kernel.  If we
-                        * don't care (see zfs_rename), then ignore the
-                        * error quietly.
-                        */
-                       if (ifexists) {
-                               return (0);
-                       }
-
-                       /* FALLTHROUGH */
-
-               default:
-                       return (zfs_standard_error_fmt(hdl, errno,
-                           dgettext(TEXT_DOMAIN, "cannot create device links "
-                           "for '%s'"), dataset));
-               }
-       }
-
-       /*
-        * If privileged call devfsadm and wait for the links to
-        * magically appear.
-        * Otherwise, print out an informational message.
-        */
-
-       priv_effective = priv_allocset();
-       (void) getppriv(PRIV_EFFECTIVE, priv_effective);
-       privileged = (priv_isfullset(priv_effective) == B_TRUE);
-       priv_freeset(priv_effective);
-
-       if (privileged) {
-               if ((dhdl = di_devlink_init(ZFS_DRIVER,
-                   DI_MAKE_LINK)) == NULL) {
-                       zfs_error_aux(hdl, strerror(errno));
-                       (void) zfs_error_fmt(hdl, errno,
-                           dgettext(TEXT_DOMAIN, "cannot create device links "
-                           "for '%s'"), dataset);
-                       (void) ioctl(hdl->libzfs_fd, ZFS_IOC_REMOVE_MINOR, &zc);
-                       return (-1);
-               } else {
-                       (void) di_devlink_fini(&dhdl);
-               }
-       } else {
-               char pathname[MAXPATHLEN];
-               struct stat64 statbuf;
-               int i;
-
-#define        MAX_WAIT        10
-
-               /*
-                * This is the poor mans way of waiting for the link
-                * to show up.  If after 10 seconds we still don't
-                * have it, then print out a message.
-                */
-               (void) snprintf(pathname, sizeof (pathname), "/dev/zvol/dsk/%s",
-                   dataset);
-
-               for (i = 0; i != MAX_WAIT; i++) {
-                       if (stat64(pathname, &statbuf) == 0)
-                               break;
-                       (void) sleep(1);
-               }
-               if (i == MAX_WAIT)
-                       (void) printf(gettext("%s may not be immediately "
-                           "available\n"), pathname);
-       }
-
-       return (0);
-}
-
-/*
- * Remove a minor node for the given zvol and the associated /dev links.
- */
-int
-zvol_remove_link(libzfs_handle_t *hdl, const char *dataset)
+nvlist_t *
+zfs_get_user_props(zfs_handle_t *zhp)
 {
-       zfs_cmd_t zc = { 0 };
-
-       (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
-
-       if (ioctl(hdl->libzfs_fd, ZFS_IOC_REMOVE_MINOR, &zc) != 0) {
-               switch (errno) {
-               case ENXIO:
-                       /*
-                        * Silently ignore the case where the link no longer
-                        * exists, so that 'zfs volfini' can be run multiple
-                        * times without errors.
-                        */
-                       return (0);
-
-               default:
-                       return (zfs_standard_error_fmt(hdl, errno,
-                           dgettext(TEXT_DOMAIN, "cannot remove device "
-                           "links for '%s'"), dataset));
-               }
-       }
-
-       return (0);
+       return (zhp->zfs_user_props);
 }
 
 nvlist_t *
-zfs_get_user_props(zfs_handle_t *zhp)
+zfs_get_recvd_props(zfs_handle_t *zhp)
 {
-       return (zhp->zfs_user_props);
+       if (zhp->zfs_recvd_props == NULL)
+               if (get_recvd_props_ioctl(zhp) != 0)
+                       return (NULL);
+       return (zhp->zfs_recvd_props);
 }
 
 /*
@@ -3796,10 +3627,12 @@ zfs_get_user_props(zfs_handle_t *zhp)
  *        for new unique user properties and add them to the list.
  *
  *      - For non fixed-width properties, keep track of the maximum width seen
- *        so that we can size the column appropriately.
+ *        so that we can size the column appropriately. If the user has
+ *        requested received property values, we also need to compute the width
+ *        of the RECEIVED column.
  */
 int
-zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp)
+zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received)
 {
        libzfs_handle_t *hdl = zhp->zfs_hdl;
        zprop_list_t *entry;
@@ -3870,12 +3703,24 @@ zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp)
                                if (strlen(buf) > entry->pl_width)
                                        entry->pl_width = strlen(buf);
                        }
-               } else if (nvlist_lookup_nvlist(userprops,
-                   entry->pl_user_prop, &propval)  == 0) {
-                       verify(nvlist_lookup_string(propval,
-                           ZPROP_VALUE, &strval) == 0);
-                       if (strlen(strval) > entry->pl_width)
-                               entry->pl_width = strlen(strval);
+                       if (received && zfs_prop_get_recvd(zhp,
+                           zfs_prop_to_name(entry->pl_prop),
+                           buf, sizeof (buf), B_FALSE) == 0)
+                               if (strlen(buf) > entry->pl_recvd_width)
+                                       entry->pl_recvd_width = strlen(buf);
+               } else {
+                       if (nvlist_lookup_nvlist(userprops, entry->pl_user_prop,
+                           &propval) == 0) {
+                               verify(nvlist_lookup_string(propval,
+                                   ZPROP_VALUE, &strval) == 0);
+                               if (strlen(strval) > entry->pl_width)
+                                       entry->pl_width = strlen(strval);
+                       }
+                       if (received && zfs_prop_get_recvd(zhp,
+                           entry->pl_user_prop,
+                           buf, sizeof (buf), B_FALSE) == 0)
+                               if (strlen(buf) > entry->pl_recvd_width)
+                                       entry->pl_recvd_width = strlen(buf);
                }
        }
 
@@ -3883,52 +3728,6 @@ zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp)
 }
 
 int
-zfs_iscsi_perm_check(libzfs_handle_t *hdl, char *dataset, ucred_t *cred)
-{
-       zfs_cmd_t zc = { 0 };
-       nvlist_t *nvp;
-       gid_t gid;
-       uid_t uid;
-       const gid_t *groups;
-       int group_cnt;
-       int error;
-
-       if (nvlist_alloc(&nvp, NV_UNIQUE_NAME, 0) != 0)
-               return (no_memory(hdl));
-
-       uid = ucred_geteuid(cred);
-       gid = ucred_getegid(cred);
-       group_cnt = ucred_getgroups(cred, &groups);
-
-       if (uid == (uid_t)-1 || gid == (uid_t)-1 || group_cnt == (uid_t)-1)
-               return (1);
-
-       if (nvlist_add_uint32(nvp, ZFS_DELEG_PERM_UID, uid) != 0) {
-               nvlist_free(nvp);
-               return (1);
-       }
-
-       if (nvlist_add_uint32(nvp, ZFS_DELEG_PERM_GID, gid) != 0) {
-               nvlist_free(nvp);
-               return (1);
-       }
-
-       if (nvlist_add_uint32_array(nvp,
-           ZFS_DELEG_PERM_GROUPS, (uint32_t *)groups, group_cnt) != 0) {
-               nvlist_free(nvp);
-               return (1);
-       }
-       (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
-
-       if (zcmd_write_src_nvlist(hdl, &zc, nvp))
-               return (-1);
-
-       error = ioctl(hdl->libzfs_fd, ZFS_IOC_ISCSI_PERM_CHECK, &zc);
-       nvlist_free(nvp);
-       return (error);
-}
-
-int
 zfs_deleg_share_nfs(libzfs_handle_t *hdl, char *dataset, char *path,
     char *resource, void *export, void *sharetab,
     int sharemax, zfs_share_op_t operation)
@@ -3966,9 +3765,11 @@ zfs_prune_proplist(zfs_handle_t *zhp, uint8_t *props)
                nvpair_t *next = nvlist_next_nvpair(zhp->zfs_props, curr);
 
                /*
-                * We leave user:props in the nvlist, so there will be
-                * some ZPROP_INVAL.  To be extra safe, don't prune
-                * those.
+                * User properties will result in ZPROP_INVAL, and since we
+                * only know how to prune standard ZFS properties, we always
+                * leave these in the list.  This can also happen if we
+                * encounter an unknown DSL property (when running older
+                * software, for example).
                 */
                if (zfs_prop != ZPROP_INVAL && props[zfs_prop] == B_FALSE)
                        (void) nvlist_remove(zhp->zfs_props,
@@ -4097,15 +3898,18 @@ zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type,
 
 int
 zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag,
-    boolean_t recursive)
+    boolean_t recursive, boolean_t temphold, boolean_t enoent_ok)
 {
        zfs_cmd_t zc = { 0 };
        libzfs_handle_t *hdl = zhp->zfs_hdl;
 
        (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
        (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
-       (void) strlcpy(zc.zc_string, tag, sizeof (zc.zc_string));
+       if (strlcpy(zc.zc_string, tag, sizeof (zc.zc_string))
+           >= sizeof (zc.zc_string))
+               return (zfs_error(hdl, EZFS_TAGTOOLONG, tag));
        zc.zc_cookie = recursive;
+       zc.zc_temphold = temphold;
 
        if (zfs_ioctl(hdl, ZFS_IOC_HOLD, &zc) != 0) {
                char errbuf[ZFS_MAXNAMELEN+32];
@@ -4117,6 +3921,14 @@ zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag,
                (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
                    "cannot hold '%s@%s'"), zc.zc_name, snapname);
                switch (errno) {
+               case E2BIG:
+                       /*
+                        * Temporary tags wind up having the ds object id
+                        * prepended. So even if we passed the length check
+                        * above, it's still possible for the tag to wind
+                        * up being slightly too long.
+                        */
+                       return (zfs_error(hdl, EZFS_TAGTOOLONG, errbuf));
                case ENOTSUP:
                        zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                            "pool must be upgraded"));
@@ -4125,6 +3937,10 @@ zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag,
                        return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
                case EEXIST:
                        return (zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf));
+               case ENOENT:
+                       if (enoent_ok)
+                               return (0);
+                       /* FALLTHROUGH */
                default:
                        return (zfs_standard_error_fmt(hdl, errno, errbuf));
                }
@@ -4133,6 +3949,102 @@ zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag,
        return (0);
 }
 
+struct hold_range_arg {
+       zfs_handle_t    *origin;
+       const char      *fromsnap;
+       const char      *tosnap;
+       char            lastsnapheld[ZFS_MAXNAMELEN];
+       const char      *tag;
+       boolean_t       temphold;
+       boolean_t       seento;
+       boolean_t       seenfrom;
+       boolean_t       holding;
+       boolean_t       recursive;
+       snapfilter_cb_t *filter_cb;
+       void            *filter_cb_arg;
+};
+
+static int
+zfs_hold_range_one(zfs_handle_t *zhp, void *arg)
+{
+       struct hold_range_arg *hra = arg;
+       const char *thissnap;
+       int error;
+
+       thissnap = strchr(zfs_get_name(zhp), '@') + 1;
+
+       if (hra->fromsnap && !hra->seenfrom &&
+           strcmp(hra->fromsnap, thissnap) == 0)
+               hra->seenfrom = B_TRUE;
+
+       /* snap is older or newer than the desired range, ignore it */
+       if (hra->seento || !hra->seenfrom) {
+               zfs_close(zhp);
+               return (0);
+       }
+
+       if (!hra->seento && strcmp(hra->tosnap, thissnap) == 0)
+               hra->seento = B_TRUE;
+
+       if (hra->filter_cb != NULL &&
+           hra->filter_cb(zhp, hra->filter_cb_arg) == B_FALSE) {
+               zfs_close(zhp);
+               return (0);
+       }
+
+       if (hra->holding) {
+               /* We could be racing with destroy, so ignore ENOENT. */
+               error = zfs_hold(hra->origin, thissnap, hra->tag,
+                   hra->recursive, hra->temphold, B_TRUE);
+               if (error == 0) {
+                       (void) strlcpy(hra->lastsnapheld, zfs_get_name(zhp),
+                           sizeof (hra->lastsnapheld));
+               }
+       } else {
+               error = zfs_release(hra->origin, thissnap, hra->tag,
+                   hra->recursive);
+       }
+
+       zfs_close(zhp);
+       return (error);
+}
+
+/*
+ * Add a user hold on the set of snapshots starting with fromsnap up to
+ * and including tosnap. If we're unable to acquire a particular hold,
+ * undo any holds up to that point.
+ */
+int
+zfs_hold_range(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
+    const char *tag, boolean_t recursive, boolean_t temphold,
+    snapfilter_cb_t filter_cb, void *cbarg)
+{
+       struct hold_range_arg arg = { 0 };
+       int error;
+
+       arg.origin = zhp;
+       arg.fromsnap = fromsnap;
+       arg.tosnap = tosnap;
+       arg.tag = tag;
+       arg.temphold = temphold;
+       arg.holding = B_TRUE;
+       arg.recursive = recursive;
+       arg.seenfrom = (fromsnap == NULL);
+       arg.filter_cb = filter_cb;
+       arg.filter_cb_arg = cbarg;
+
+       error = zfs_iter_snapshots_sorted(zhp, zfs_hold_range_one, &arg);
+
+       /*
+        * Make sure we either hold the entire range or none.
+        */
+       if (error && arg.lastsnapheld[0] != '\0') {
+               (void) zfs_release_range(zhp, fromsnap,
+                   (const char *)arg.lastsnapheld, tag, recursive);
+       }
+       return (error);
+}
+
 int
 zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag,
     boolean_t recursive)
@@ -4142,7 +4054,9 @@ zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag,
 
        (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
        (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
-       (void) strlcpy(zc.zc_string, tag, sizeof (zc.zc_string));
+       if (strlcpy(zc.zc_string, tag, sizeof (zc.zc_string))
+           >= sizeof (zc.zc_string))
+               return (zfs_error(hdl, EZFS_TAGTOOLONG, tag));
        zc.zc_cookie = recursive;
 
        if (zfs_ioctl(hdl, ZFS_IOC_RELEASE, &zc) != 0) {
@@ -4153,7 +4067,8 @@ zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag,
                 * zc.zc_name.
                 */
                (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
-                   "cannot release '%s@%s'"), zc.zc_name, snapname);
+                   "cannot release '%s' from '%s@%s'"), tag, zc.zc_name,
+                   snapname);
                switch (errno) {
                case ESRCH:
                        return (zfs_error(hdl, EZFS_REFTAG_RELE, errbuf));
@@ -4170,3 +4085,61 @@ zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag,
 
        return (0);
 }
+
+/*
+ * Release a user hold from the set of snapshots starting with fromsnap
+ * up to and including tosnap.
+ */
+int
+zfs_release_range(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
+    const char *tag, boolean_t recursive)
+{
+       struct hold_range_arg arg = { 0 };
+
+       arg.origin = zhp;
+       arg.fromsnap = fromsnap;
+       arg.tosnap = tosnap;
+       arg.tag = tag;
+       arg.recursive = recursive;
+       arg.seenfrom = (fromsnap == NULL);
+
+       return (zfs_iter_snapshots_sorted(zhp, zfs_hold_range_one, &arg));
+}
+
+uint64_t
+zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
+{
+       uint64_t numdb;
+       uint64_t nblocks, volblocksize;
+       int ncopies;
+       char *strval;
+
+       if (nvlist_lookup_string(props,
+           zfs_prop_to_name(ZFS_PROP_COPIES), &strval) == 0)
+               ncopies = atoi(strval);
+       else
+               ncopies = 1;
+       if (nvlist_lookup_uint64(props,
+           zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
+           &volblocksize) != 0)
+               volblocksize = ZVOL_DEFAULT_BLOCKSIZE;
+       nblocks = volsize/volblocksize;
+       /* start with metadnode L0-L6 */
+       numdb = 7;
+       /* calculate number of indirects */
+       while (nblocks > 1) {
+               nblocks += DNODES_PER_LEVEL - 1;
+               nblocks /= DNODES_PER_LEVEL;
+               numdb += nblocks;
+       }
+       numdb *= MIN(SPA_DVAS_PER_BP, ncopies + 1);
+       volsize *= ncopies;
+       /*
+        * this is exactly DN_MAX_INDBLKSHIFT when metadata isn't
+        * compressed, but in practice they compress down to about
+        * 1100 bytes
+        */
+       numdb *= 1ULL << DN_MAX_INDBLKSHIFT;
+       volsize += numdb;
+       return (volsize);
+}
diff --git a/lib/libzfs/libzfs_fru.c b/lib/libzfs/libzfs_fru.c
new file mode 100644 (file)
index 0000000..788fa2c
--- /dev/null
@@ -0,0 +1,452 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <dlfcn.h>
+#include <errno.h>
+#include <libintl.h>
+#include <link.h>
+#include <pthread.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <libzfs.h>
+
+#include <fm/libtopo.h>
+#include <sys/fm/protocol.h>
+#include <sys/systeminfo.h>
+
+#include "libzfs_impl.h"
+
+/*
+ * This file is responsible for determining the relationship between I/O
+ * device paths and physical locations.  In the world of MPxIO and external
+ * enclosures, the device path is not synonymous with the physical location.
+ * If you remove a drive and insert it into a different slot, it will end up
+ * with the same path under MPxIO.  If you recable storage enclosures, the
+ * device paths may change.  All of this makes it difficult to implement the
+ * 'autoreplace' property, which is supposed to automatically manage disk
+ * replacement based on physical slot.
+ *
+ * In order to work around these limitations, we have a per-vdev FRU property
+ * that is the libtopo path (minus disk-specific authority information) to the
+ * physical location of the device on the system.  This is an optional
+ * property, and is only needed when using the 'autoreplace' property or when
+ * generating FMA faults against vdevs.
+ */
+
+/*
+ * Because the FMA packages depend on ZFS, we have to dlopen() libtopo in case
+ * it is not present.  We only need this once per library instance, so it is
+ * not part of the libzfs handle.
+ */
+static void *_topo_dlhandle;
+static topo_hdl_t *(*_topo_open)(int, const char *, int *);
+static void (*_topo_close)(topo_hdl_t *);
+static char *(*_topo_snap_hold)(topo_hdl_t *, const char *, int *);
+static void (*_topo_snap_release)(topo_hdl_t *);
+static topo_walk_t *(*_topo_walk_init)(topo_hdl_t *, const char *,
+    topo_walk_cb_t, void *, int *);
+static int (*_topo_walk_step)(topo_walk_t *, int);
+static void (*_topo_walk_fini)(topo_walk_t *);
+static void (*_topo_hdl_strfree)(topo_hdl_t *, char *);
+static char *(*_topo_node_name)(tnode_t *);
+static int (*_topo_prop_get_string)(tnode_t *, const char *, const char *,
+    char **, int *);
+static int (*_topo_node_fru)(tnode_t *, nvlist_t **, nvlist_t *, int *);
+static int (*_topo_fmri_nvl2str)(topo_hdl_t *, nvlist_t *, char **, int *);
+static int (*_topo_fmri_strcmp_noauth)(topo_hdl_t *, const char *,
+    const char *);
+
+#define        ZFS_FRU_HASH_SIZE       257
+
+static size_t
+fru_strhash(const char *key)
+{
+       ulong_t g, h = 0;
+       const char *p;
+
+       for (p = key; *p != '\0'; p++) {
+               h = (h << 4) + *p;
+
+               if ((g = (h & 0xf0000000)) != 0) {
+                       h ^= (g >> 24);
+                       h ^= g;
+               }
+       }
+
+       return (h % ZFS_FRU_HASH_SIZE);
+}
+
+static int
+libzfs_fru_gather(topo_hdl_t *thp, tnode_t *tn, void *arg)
+{
+       libzfs_handle_t *hdl = arg;
+       nvlist_t *fru;
+       char *devpath, *frustr;
+       int err;
+       libzfs_fru_t *frup;
+       size_t idx;
+
+       /*
+        * If this is the chassis node, and we don't yet have the system
+        * chassis ID, then fill in this value now.
+        */
+       if (hdl->libzfs_chassis_id[0] == '\0' &&
+           strcmp(_topo_node_name(tn), "chassis") == 0) {
+               if (_topo_prop_get_string(tn, FM_FMRI_AUTHORITY,
+                   FM_FMRI_AUTH_CHASSIS, &devpath, &err) == 0)
+                       (void) strlcpy(hdl->libzfs_chassis_id, devpath,
+                           sizeof (hdl->libzfs_chassis_id));
+       }
+
+       /*
+        * Skip non-disk nodes.
+        */
+       if (strcmp(_topo_node_name(tn), "disk") != 0)
+               return (TOPO_WALK_NEXT);
+
+       /*
+        * Get the devfs path and FRU.
+        */
+       if (_topo_prop_get_string(tn, "io", "devfs-path", &devpath, &err) != 0)
+               return (TOPO_WALK_NEXT);
+
+       if (libzfs_fru_lookup(hdl, devpath) != NULL) {
+               _topo_hdl_strfree(thp, devpath);
+               return (TOPO_WALK_NEXT);
+       }
+
+       if (_topo_node_fru(tn, &fru, NULL, &err) != 0) {
+               _topo_hdl_strfree(thp, devpath);
+               return (TOPO_WALK_NEXT);
+       }
+
+       /*
+        * Convert the FRU into a string.
+        */
+       if (_topo_fmri_nvl2str(thp, fru, &frustr, &err) != 0) {
+               nvlist_free(fru);
+               _topo_hdl_strfree(thp, devpath);
+               return (TOPO_WALK_NEXT);
+       }
+
+       nvlist_free(fru);
+
+       /*
+        * Finally, we have a FRU string and device path.  Add it to the hash.
+        */
+       if ((frup = calloc(sizeof (libzfs_fru_t), 1)) == NULL) {
+               _topo_hdl_strfree(thp, devpath);
+               _topo_hdl_strfree(thp, frustr);
+               return (TOPO_WALK_NEXT);
+       }
+
+       if ((frup->zf_device = strdup(devpath)) == NULL ||
+           (frup->zf_fru = strdup(frustr)) == NULL) {
+               free(frup->zf_device);
+               free(frup);
+               _topo_hdl_strfree(thp, devpath);
+               _topo_hdl_strfree(thp, frustr);
+               return (TOPO_WALK_NEXT);
+       }
+
+       _topo_hdl_strfree(thp, devpath);
+       _topo_hdl_strfree(thp, frustr);
+
+       idx = fru_strhash(frup->zf_device);
+       frup->zf_chain = hdl->libzfs_fru_hash[idx];
+       hdl->libzfs_fru_hash[idx] = frup;
+       frup->zf_next = hdl->libzfs_fru_list;
+       hdl->libzfs_fru_list = frup;
+
+       return (TOPO_WALK_NEXT);
+}
+
+/*
+ * Called during initialization to setup the dynamic libtopo connection.
+ */
+#pragma init(libzfs_init_fru)
+static void
+libzfs_init_fru(void)
+{
+       char path[MAXPATHLEN];
+       char isa[257];
+
+#if defined(_LP64)
+       if (sysinfo(SI_ARCHITECTURE_64, isa, sizeof (isa)) < 0)
+               isa[0] = '\0';
+#else
+       isa[0] = '\0';
+#endif
+       (void) snprintf(path, sizeof (path),
+           "/usr/lib/fm/%s/libtopo.so", isa);
+
+       if ((_topo_dlhandle = dlopen(path, RTLD_LAZY)) == NULL)
+               return;
+
+       _topo_open = (topo_hdl_t *(*)())
+           dlsym(_topo_dlhandle, "topo_open");
+       _topo_close = (void (*)())
+           dlsym(_topo_dlhandle, "topo_close");
+       _topo_snap_hold = (char *(*)())
+           dlsym(_topo_dlhandle, "topo_snap_hold");
+       _topo_snap_release = (void (*)())
+           dlsym(_topo_dlhandle, "topo_snap_release");
+       _topo_walk_init = (topo_walk_t *(*)())
+           dlsym(_topo_dlhandle, "topo_walk_init");
+       _topo_walk_step = (int (*)())
+           dlsym(_topo_dlhandle, "topo_walk_step");
+       _topo_walk_fini = (void (*)())
+           dlsym(_topo_dlhandle, "topo_walk_fini");
+       _topo_hdl_strfree = (void (*)())
+           dlsym(_topo_dlhandle, "topo_hdl_strfree");
+       _topo_node_name = (char *(*)())
+           dlsym(_topo_dlhandle, "topo_node_name");
+       _topo_prop_get_string = (int (*)())
+           dlsym(_topo_dlhandle, "topo_prop_get_string");
+       _topo_node_fru = (int (*)())
+           dlsym(_topo_dlhandle, "topo_node_fru");
+       _topo_fmri_nvl2str = (int (*)())
+           dlsym(_topo_dlhandle, "topo_fmri_nvl2str");
+       _topo_fmri_strcmp_noauth = (int (*)())
+           dlsym(_topo_dlhandle, "topo_fmri_strcmp_noauth");
+
+       if (_topo_open == NULL || _topo_close == NULL ||
+           _topo_snap_hold == NULL || _topo_snap_release == NULL ||
+           _topo_walk_init == NULL || _topo_walk_step == NULL ||
+           _topo_walk_fini == NULL || _topo_hdl_strfree == NULL ||
+           _topo_node_name == NULL || _topo_prop_get_string == NULL ||
+           _topo_node_fru == NULL || _topo_fmri_nvl2str == NULL ||
+           _topo_fmri_strcmp_noauth == NULL) {
+               (void) dlclose(_topo_dlhandle);
+               _topo_dlhandle = NULL;
+       }
+}
+
+/*
+ * Refresh the mappings from device path -> FMRI.  We do this by walking the
+ * hc topology looking for disk nodes, and recording the io/devfs-path and FRU.
+ * Note that we strip out the disk-specific authority information (serial,
+ * part, revision, etc) so that we are left with only the identifying
+ * characteristics of the slot (hc path and chassis-id).
+ */
+void
+libzfs_fru_refresh(libzfs_handle_t *hdl)
+{
+       int err;
+       char *uuid;
+       topo_hdl_t *thp;
+       topo_walk_t *twp;
+
+       if (_topo_dlhandle == NULL)
+               return;
+
+       /*
+        * Clear the FRU hash and initialize our basic structures.
+        */
+       libzfs_fru_clear(hdl, B_FALSE);
+
+       if ((hdl->libzfs_topo_hdl = _topo_open(TOPO_VERSION,
+           NULL, &err)) == NULL)
+               return;
+
+       thp = hdl->libzfs_topo_hdl;
+
+       if ((uuid = _topo_snap_hold(thp, NULL, &err)) == NULL)
+               return;
+
+       _topo_hdl_strfree(thp, uuid);
+
+       if (hdl->libzfs_fru_hash == NULL &&
+           (hdl->libzfs_fru_hash =
+           calloc(ZFS_FRU_HASH_SIZE * sizeof (void *), 1)) == NULL)
+               return;
+
+       /*
+        * We now have a topo snapshot, so iterate over the hc topology looking
+        * for disks to add to the hash.
+        */
+       twp = _topo_walk_init(thp, FM_FMRI_SCHEME_HC,
+           libzfs_fru_gather, hdl, &err);
+       if (twp != NULL) {
+               (void) _topo_walk_step(twp, TOPO_WALK_CHILD);
+               _topo_walk_fini(twp);
+       }
+}
+
+/*
+ * Given a devfs path, return the FRU for the device, if known.  This will
+ * automatically call libzfs_fru_refresh() if it hasn't already been called by
+ * the consumer.  The string returned is valid until the next call to
+ * libzfs_fru_refresh().
+ */
+const char *
+libzfs_fru_lookup(libzfs_handle_t *hdl, const char *devpath)
+{
+       size_t idx = fru_strhash(devpath);
+       libzfs_fru_t *frup;
+
+       if (hdl->libzfs_fru_hash == NULL)
+               libzfs_fru_refresh(hdl);
+
+       if (hdl->libzfs_fru_hash == NULL)
+               return (NULL);
+
+       for (frup = hdl->libzfs_fru_hash[idx]; frup != NULL;
+           frup = frup->zf_chain) {
+               if (strcmp(devpath, frup->zf_device) == 0)
+                       return (frup->zf_fru);
+       }
+
+       return (NULL);
+}
+
+/*
+ * Given a fru path, return the device path.  This will automatically call
+ * libzfs_fru_refresh() if it hasn't already been called by the consumer.  The
+ * string returned is valid until the next call to libzfs_fru_refresh().
+ */
+const char *
+libzfs_fru_devpath(libzfs_handle_t *hdl, const char *fru)
+{
+       libzfs_fru_t *frup;
+       size_t idx;
+
+       if (hdl->libzfs_fru_hash == NULL)
+               libzfs_fru_refresh(hdl);
+
+       if (hdl->libzfs_fru_hash == NULL)
+               return (NULL);
+
+       for (idx = 0; idx < ZFS_FRU_HASH_SIZE; idx++) {
+               for (frup = hdl->libzfs_fru_hash[idx]; frup != NULL;
+                   frup = frup->zf_next) {
+                       if (_topo_fmri_strcmp_noauth(hdl->libzfs_topo_hdl,
+                           fru, frup->zf_fru))
+                               return (frup->zf_device);
+               }
+       }
+
+       return (NULL);
+}
+
+/*
+ * Change the stored FRU for the given vdev.
+ */
+int
+zpool_fru_set(zpool_handle_t *zhp, uint64_t vdev_guid, const char *fru)
+{
+       zfs_cmd_t zc = { 0 };
+
+       (void) strncpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+       (void) strncpy(zc.zc_value, fru, sizeof (zc.zc_value));
+       zc.zc_guid = vdev_guid;
+
+       if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SETFRU, &zc) != 0)
+               return (zpool_standard_error_fmt(zhp->zpool_hdl, errno,
+                   dgettext(TEXT_DOMAIN, "cannot set FRU")));
+
+       return (0);
+}
+
+/*
+ * Compare two FRUs, ignoring any authority information.
+ */
+boolean_t
+libzfs_fru_compare(libzfs_handle_t *hdl, const char *a, const char *b)
+{
+       if (hdl->libzfs_fru_hash == NULL)
+               libzfs_fru_refresh(hdl);
+
+       if (hdl->libzfs_fru_hash == NULL)
+               return (strcmp(a, b) == 0);
+
+       return (_topo_fmri_strcmp_noauth(hdl->libzfs_topo_hdl, a, b));
+}
+
+/*
+ * This special function checks to see whether the FRU indicates it's supposed
+ * to be in the system chassis, but the chassis-id doesn't match.  This can
+ * happen in a clustered case, where both head nodes have the same logical
+ * disk, but opening the device on the other head node is meaningless.
+ */
+boolean_t
+libzfs_fru_notself(libzfs_handle_t *hdl, const char *fru)
+{
+       const char *chassisid;
+       size_t len;
+
+       if (hdl->libzfs_fru_hash == NULL)
+               libzfs_fru_refresh(hdl);
+
+       if (hdl->libzfs_chassis_id[0] == '\0')
+               return (B_FALSE);
+
+       if (strstr(fru, "/chassis=0/") == NULL)
+               return (B_FALSE);
+
+       if ((chassisid = strstr(fru, ":chassis-id=")) == NULL)
+               return (B_FALSE);
+
+       chassisid += 12;
+       len = strlen(hdl->libzfs_chassis_id);
+       if (strncmp(chassisid, hdl->libzfs_chassis_id, len) == 0 &&
+           (chassisid[len] == '/' || chassisid[len] == ':'))
+               return (B_FALSE);
+
+       return (B_TRUE);
+}
+
+/*
+ * Clear memory associated with the FRU hash.
+ */
+void
+libzfs_fru_clear(libzfs_handle_t *hdl, boolean_t final)
+{
+       libzfs_fru_t *frup;
+
+       while ((frup = hdl->libzfs_fru_list) != NULL) {
+               hdl->libzfs_fru_list = frup->zf_next;
+               free(frup->zf_device);
+               free(frup->zf_fru);
+               free(frup);
+       }
+
+       hdl->libzfs_fru_list = NULL;
+
+       if (hdl->libzfs_topo_hdl != NULL) {
+               _topo_snap_release(hdl->libzfs_topo_hdl);
+               _topo_close(hdl->libzfs_topo_hdl);
+               hdl->libzfs_topo_hdl = NULL;
+       }
+
+       if (final) {
+               free(hdl->libzfs_fru_hash);
+       } else if (hdl->libzfs_fru_hash != NULL) {
+               bzero(hdl->libzfs_fru_hash,
+                   ZFS_FRU_HASH_SIZE * sizeof (void *));
+       }
+}
index d677768..fd3044b 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 /*
  * Pool import support functions.
  *
  * using our derived config, and record the results.
  */
 
+#include <ctype.h>
 #include <devid.h>
 #include <dirent.h>
 #include <errno.h>
 #include <libintl.h>
+#include <stddef.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/stat.h>
 #include <unistd.h>
 #include <fcntl.h>
+#include <sys/vtoc.h>
+#include <sys/dktp/fdisk.h>
+#include <sys/efi_partition.h>
+#include <thread_pool.h>
 
 #include <sys/vdev_impl.h>
 
@@ -388,8 +392,6 @@ refresh_config(libzfs_handle_t *hdl, nvlist_t *config)
        }
 
        if (err) {
-               (void) zpool_standard_error(hdl, errno,
-                   dgettext(TEXT_DOMAIN, "cannot discover pools"));
                zcmd_free_nvlists(&zc);
                return (NULL);
        }
@@ -404,6 +406,21 @@ refresh_config(libzfs_handle_t *hdl, nvlist_t *config)
 }
 
 /*
+ * Determine if the vdev id is a hole in the namespace.
+ */
+boolean_t
+vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)
+{
+       for (int c = 0; c < holes; c++) {
+
+               /* Top-level is a hole */
+               if (hole_array[c] == id)
+                       return (B_TRUE);
+       }
+       return (B_FALSE);
+}
+
+/*
  * Convert our list of pools into the definitive set of configurations.  We
  * start by picking the best config for each toplevel vdev.  Once that's done,
  * we assemble the toplevel vdevs into a full config for the pool.  We make a
@@ -425,17 +442,20 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
        uint64_t version, guid;
        uint_t children = 0;
        nvlist_t **child = NULL;
+       uint_t holes;
+       uint64_t *hole_array, max_id;
        uint_t c;
        boolean_t isactive;
        uint64_t hostid;
        nvlist_t *nvl;
        boolean_t found_one = B_FALSE;
+       boolean_t valid_top_config = B_FALSE;
 
        if (nvlist_alloc(&ret, 0, 0) != 0)
                goto nomem;
 
        for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
-               uint64_t id;
+               uint64_t id, max_txg = 0;
 
                if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0)
                        goto nomem;
@@ -463,6 +483,42 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
                                }
                        }
 
+                       /*
+                        * We rely on the fact that the max txg for the
+                        * pool will contain the most up-to-date information
+                        * about the valid top-levels in the vdev namespace.
+                        */
+                       if (best_txg > max_txg) {
+                               (void) nvlist_remove(config,
+                                   ZPOOL_CONFIG_VDEV_CHILDREN,
+                                   DATA_TYPE_UINT64);
+                               (void) nvlist_remove(config,
+                                   ZPOOL_CONFIG_HOLE_ARRAY,
+                                   DATA_TYPE_UINT64_ARRAY);
+
+                               max_txg = best_txg;
+                               hole_array = NULL;
+                               holes = 0;
+                               max_id = 0;
+                               valid_top_config = B_FALSE;
+
+                               if (nvlist_lookup_uint64(tmp,
+                                   ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) {
+                                       verify(nvlist_add_uint64(config,
+                                           ZPOOL_CONFIG_VDEV_CHILDREN,
+                                           max_id) == 0);
+                                       valid_top_config = B_TRUE;
+                               }
+
+                               if (nvlist_lookup_uint64_array(tmp,
+                                   ZPOOL_CONFIG_HOLE_ARRAY, &hole_array,
+                                   &holes) == 0) {
+                                       verify(nvlist_add_uint64_array(config,
+                                           ZPOOL_CONFIG_HOLE_ARRAY,
+                                           hole_array, holes) == 0);
+                               }
+                       }
+
                        if (!config_seen) {
                                /*
                                 * Copy the relevant pieces of data to the pool
@@ -522,6 +578,7 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
                            ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0);
                        verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID,
                            &id) == 0);
+
                        if (id >= children) {
                                nvlist_t **newchild;
 
@@ -542,17 +599,82 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
 
                }
 
+               /*
+                * If we have information about all the top-levels then
+                * clean up the nvlist which we've constructed. This
+                * means removing any extraneous devices that are
+                * beyond the valid range or adding devices to the end
+                * of our array which appear to be missing.
+                */
+               if (valid_top_config) {
+                       if (max_id < children) {
+                               for (c = max_id; c < children; c++)
+                                       nvlist_free(child[c]);
+                               children = max_id;
+                       } else if (max_id > children) {
+                               nvlist_t **newchild;
+
+                               newchild = zfs_alloc(hdl, (max_id) *
+                                   sizeof (nvlist_t *));
+                               if (newchild == NULL)
+                                       goto nomem;
+
+                               for (c = 0; c < children; c++)
+                                       newchild[c] = child[c];
+
+                               free(child);
+                               child = newchild;
+                               children = max_id;
+                       }
+               }
+
                verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
                    &guid) == 0);
 
                /*
+                * The vdev namespace may contain holes as a result of
+                * device removal. We must add them back into the vdev
+                * tree before we process any missing devices.
+                */
+               if (holes > 0) {
+                       ASSERT(valid_top_config);
+
+                       for (c = 0; c < children; c++) {
+                               nvlist_t *holey;
+
+                               if (child[c] != NULL ||
+                                   !vdev_is_hole(hole_array, holes, c))
+                                       continue;
+
+                               if (nvlist_alloc(&holey, NV_UNIQUE_NAME,
+                                   0) != 0)
+                                       goto nomem;
+
+                               /*
+                                * Holes in the namespace are treated as
+                                * "hole" top-level vdevs and have a
+                                * special flag set on them.
+                                */
+                               if (nvlist_add_string(holey,
+                                   ZPOOL_CONFIG_TYPE,
+                                   VDEV_TYPE_HOLE) != 0 ||
+                                   nvlist_add_uint64(holey,
+                                   ZPOOL_CONFIG_ID, c) != 0 ||
+                                   nvlist_add_uint64(holey,
+                                   ZPOOL_CONFIG_GUID, 0ULL) != 0)
+                                       goto nomem;
+                               child[c] = holey;
+                       }
+               }
+
+               /*
                 * Look for any missing top-level vdevs.  If this is the case,
                 * create a faked up 'missing' vdev as a placeholder.  We cannot
                 * simply compress the child array, because the kernel performs
                 * certain checks to make sure the vdev IDs match their location
                 * in the configuration.
                 */
-               for (c = 0; c < children; c++)
+               for (c = 0; c < children; c++) {
                        if (child[c] == NULL) {
                                nvlist_t *missing;
                                if (nvlist_alloc(&missing, NV_UNIQUE_NAME,
@@ -570,6 +692,7 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
                                }
                                child[c] = missing;
                        }
+               }
 
                /*
                 * Put all of this pool's top-level vdevs into a root vdev.
@@ -636,8 +759,11 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
                        continue;
                }
 
-               if ((nvl = refresh_config(hdl, config)) == NULL)
-                       goto error;
+               if ((nvl = refresh_config(hdl, config)) == NULL) {
+                       nvlist_free(config);
+                       config = NULL;
+                       continue;
+               }
 
                nvlist_free(config);
                config = nvl;
@@ -777,6 +903,212 @@ zpool_read_label(int fd, nvlist_t **config)
        return (0);
 }
 
+typedef struct rdsk_node {
+       char *rn_name;
+       int rn_dfd;
+       libzfs_handle_t *rn_hdl;
+       nvlist_t *rn_config;
+       avl_tree_t *rn_avl;
+       avl_node_t rn_node;
+       boolean_t rn_nozpool;
+} rdsk_node_t;
+
+static int
+slice_cache_compare(const void *arg1, const void *arg2)
+{
+       const char  *nm1 = ((rdsk_node_t *)arg1)->rn_name;
+       const char  *nm2 = ((rdsk_node_t *)arg2)->rn_name;
+       char *nm1slice, *nm2slice;
+       int rv;
+
+       /*
+        * slices zero and two are the most likely to provide results,
+        * so put those first
+        */
+       nm1slice = strstr(nm1, "s0");
+       nm2slice = strstr(nm2, "s0");
+       if (nm1slice && !nm2slice) {
+               return (-1);
+       }
+       if (!nm1slice && nm2slice) {
+               return (1);
+       }
+       nm1slice = strstr(nm1, "s2");
+       nm2slice = strstr(nm2, "s2");
+       if (nm1slice && !nm2slice) {
+               return (-1);
+       }
+       if (!nm1slice && nm2slice) {
+               return (1);
+       }
+
+       rv = strcmp(nm1, nm2);
+       if (rv == 0)
+               return (0);
+       return (rv > 0 ? 1 : -1);
+}
+
+static void
+check_one_slice(avl_tree_t *r, char *diskname, uint_t partno,
+    diskaddr_t size, uint_t blksz)
+{
+       rdsk_node_t tmpnode;
+       rdsk_node_t *node;
+       char sname[MAXNAMELEN];
+
+       tmpnode.rn_name = &sname[0];
+       (void) snprintf(tmpnode.rn_name, MAXNAMELEN, "%s%u",
+           diskname, partno);
+       /*
+        * protect against division by zero for disk labels that
+        * contain a bogus sector size
+        */
+       if (blksz == 0)
+               blksz = DEV_BSIZE;
+       /* too small to contain a zpool? */
+       if ((size < (SPA_MINDEVSIZE / blksz)) &&
+           (node = avl_find(r, &tmpnode, NULL)))
+               node->rn_nozpool = B_TRUE;
+}
+
+static void
+nozpool_all_slices(avl_tree_t *r, const char *sname)
+{
+       char diskname[MAXNAMELEN];
+       char *ptr;
+       int i;
+
+       (void) strncpy(diskname, sname, MAXNAMELEN);
+       if (((ptr = strrchr(diskname, 's')) == NULL) &&
+           ((ptr = strrchr(diskname, 'p')) == NULL))
+               return;
+       ptr[0] = 's';
+       ptr[1] = '\0';
+       for (i = 0; i < NDKMAP; i++)
+               check_one_slice(r, diskname, i, 0, 1);
+       ptr[0] = 'p';
+       for (i = 0; i <= FD_NUMPART; i++)
+               check_one_slice(r, diskname, i, 0, 1);
+}
+
+static void
+check_slices(avl_tree_t *r, int fd, const char *sname)
+{
+       struct extvtoc vtoc;
+       struct dk_gpt *gpt;
+       char diskname[MAXNAMELEN];
+       char *ptr;
+       int i;
+
+       (void) strncpy(diskname, sname, MAXNAMELEN);
+       if ((ptr = strrchr(diskname, 's')) == NULL || !isdigit(ptr[1]))
+               return;
+       ptr[1] = '\0';
+
+       if (read_extvtoc(fd, &vtoc) >= 0) {
+               for (i = 0; i < NDKMAP; i++)
+                       check_one_slice(r, diskname, i,
+                           vtoc.v_part[i].p_size, vtoc.v_sectorsz);
+       } else if (efi_alloc_and_read(fd, &gpt) >= 0) {
+               /*
+                * on x86 we'll still have leftover links that point
+                * to slices s[9-15], so use NDKMAP instead
+                */
+               for (i = 0; i < NDKMAP; i++)
+                       check_one_slice(r, diskname, i,
+                           gpt->efi_parts[i].p_size, gpt->efi_lbasize);
+               /* nodes p[1-4] are never used with EFI labels */
+               ptr[0] = 'p';
+               for (i = 1; i <= FD_NUMPART; i++)
+                       check_one_slice(r, diskname, i, 0, 1);
+               efi_free(gpt);
+       }
+}
+
+static void
+zpool_open_func(void *arg)
+{
+       rdsk_node_t *rn = arg;
+       struct stat64 statbuf;
+       nvlist_t *config;
+       int fd;
+
+       if (rn->rn_nozpool)
+               return;
+       if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) {
+               /* symlink to a device that's no longer there */
+               if (errno == ENOENT)
+                       nozpool_all_slices(rn->rn_avl, rn->rn_name);
+               return;
+       }
+       /*
+        * Ignore failed stats.  We only want regular
+        * files, character devs and block devs.
+        */
+       if (fstat64(fd, &statbuf) != 0 ||
+           (!S_ISREG(statbuf.st_mode) &&
+           !S_ISCHR(statbuf.st_mode) &&
+           !S_ISBLK(statbuf.st_mode))) {
+               (void) close(fd);
+               return;
+       }
+       /* this file is too small to hold a zpool */
+       if (S_ISREG(statbuf.st_mode) &&
+           statbuf.st_size < SPA_MINDEVSIZE) {
+               (void) close(fd);
+               return;
+       } else if (!S_ISREG(statbuf.st_mode)) {
+               /*
+                * Try to read the disk label first so we don't have to
+                * open a bunch of minor nodes that can't have a zpool.
+                */
+               check_slices(rn->rn_avl, fd, rn->rn_name);
+       }
+
+       if ((zpool_read_label(fd, &config)) != 0) {
+               (void) close(fd);
+               (void) no_memory(rn->rn_hdl);
+               return;
+       }
+       (void) close(fd);
+
+
+       rn->rn_config = config;
+       if (config != NULL) {
+               assert(rn->rn_nozpool == B_FALSE);
+       }
+}
+
+/*
+ * Given a file descriptor, clear (zero) the label information.  This function
+ * is currently only used in the appliance stack as part of the ZFS sysevent
+ * module.
+ */
+int
+zpool_clear_label(int fd)
+{
+       struct stat64 statbuf;
+       int l;
+       vdev_label_t *label;
+       uint64_t size;
+
+       if (fstat64(fd, &statbuf) == -1)
+               return (0);
+       size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
+
+       if ((label = calloc(sizeof (vdev_label_t), 1)) == NULL)
+               return (-1);
+
+       for (l = 0; l < VDEV_LABELS; l++) {
+               if (pwrite64(fd, label, sizeof (vdev_label_t),
+                   label_offset(size, l)) != sizeof (vdev_label_t))
+                       return (-1);
+       }
+
+       free(label);
+       return (0);
+}
+
 /*
  * Given a list of directories to search, find all pools stored on disk.  This
  * includes partial pools which are not available to import.  If no args are
@@ -785,30 +1117,28 @@ zpool_read_label(int fd, nvlist_t **config)
  * to import a specific pool.
  */
 static nvlist_t *
-zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv,
-    boolean_t active_ok, char *poolname, uint64_t guid)
+zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
 {
-       int i;
+       int i, dirs = iarg->paths;
        DIR *dirp = NULL;
        struct dirent64 *dp;
        char path[MAXPATHLEN];
-       char *end;
+       char *end, **dir = iarg->path;
        size_t pathleft;
-       struct stat64 statbuf;
-       nvlist_t *ret = NULL, *config;
+       nvlist_t *ret = NULL;
        static char *default_dir = "/dev/dsk";
-       int fd;
        pool_list_t pools = { 0 };
        pool_entry_t *pe, *penext;
        vdev_entry_t *ve, *venext;
        config_entry_t *ce, *cenext;
        name_entry_t *ne, *nenext;
+       avl_tree_t slice_cache;
+       rdsk_node_t *slice;
+       void *cookie;
 
-       verify(poolname == NULL || guid == 0);
-
-       if (argc == 0) {
-               argc = 1;
-               argv = &default_dir;
+       if (dirs == 0) {
+               dirs = 1;
+               dir = &default_dir;
        }
 
        /*
@@ -816,15 +1146,15 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv,
         * possible device, organizing the information according to pool GUID
         * and toplevel GUID.
         */
-       for (i = 0; i < argc; i++) {
+       for (i = 0; i < dirs; i++) {
+               tpool_t *t;
                char *rdsk;
                int dfd;
 
                /* use realpath to normalize the path */
-               if (realpath(argv[i], path) == 0) {
+               if (realpath(dir[i], path) == 0) {
                        (void) zfs_error_fmt(hdl, EZFS_BADPATH,
-                           dgettext(TEXT_DOMAIN, "cannot open '%s'"),
-                           argv[i]);
+                           dgettext(TEXT_DOMAIN, "cannot open '%s'"), dir[i]);
                        goto error;
                }
                end = &path[strlen(path)];
@@ -851,6 +1181,8 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv,
                        goto error;
                }
 
+               avl_create(&slice_cache, slice_cache_compare,
+                   sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node));
                /*
                 * This is not MT-safe, but we have no MT consumers of libzfs
                 */
@@ -860,46 +1192,53 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv,
                            (name[1] == 0 || (name[1] == '.' && name[2] == 0)))
                                continue;
 
-                       if ((fd = openat64(dfd, name, O_RDONLY)) < 0)
-                               continue;
-
-                       /*
-                        * Ignore failed stats.  We only want regular
-                        * files, character devs and block devs.
-                        */
-                       if (fstat64(fd, &statbuf) != 0 ||
-                           (!S_ISREG(statbuf.st_mode) &&
-                           !S_ISCHR(statbuf.st_mode) &&
-                           !S_ISBLK(statbuf.st_mode))) {
-                               (void) close(fd);
-                               continue;
-                       }
-
-                       if ((zpool_read_label(fd, &config)) != 0) {
-                               (void) close(fd);
-                               (void) no_memory(hdl);
-                               goto error;
-                       }
-
-                       (void) close(fd);
-
-                       if (config != NULL) {
+                       slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
+                       slice->rn_name = zfs_strdup(hdl, name);
+                       slice->rn_avl = &slice_cache;
+                       slice->rn_dfd = dfd;
+                       slice->rn_hdl = hdl;
+                       slice->rn_nozpool = B_FALSE;
+                       avl_add(&slice_cache, slice);
+               }
+               /*
+                * create a thread pool to do all of this in parallel;
+                * rn_nozpool is not protected, so this is racy in that
+                * multiple tasks could decide that the same slice can
+                * not hold a zpool, which is benign.  Also choose
+                * double the number of processors; we hold a lot of
+                * locks in the kernel, so going beyond this doesn't
+                * buy us much.
+                */
+               t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN),
+                   0, NULL);
+               for (slice = avl_first(&slice_cache); slice;
+                   (slice = avl_walk(&slice_cache, slice,
+                   AVL_AFTER)))
+                       (void) tpool_dispatch(t, zpool_open_func, slice);
+               tpool_wait(t);
+               tpool_destroy(t);
+
+               cookie = NULL;
+               while ((slice = avl_destroy_nodes(&slice_cache,
+                   &cookie)) != NULL) {
+                       if (slice->rn_config != NULL) {
+                               nvlist_t *config = slice->rn_config;
                                boolean_t matched = B_TRUE;
 
-                               if (poolname != NULL) {
+                               if (iarg->poolname != NULL) {
                                        char *pname;
 
                                        matched = nvlist_lookup_string(config,
                                            ZPOOL_CONFIG_POOL_NAME,
                                            &pname) == 0 &&
-                                           strcmp(poolname, pname) == 0;
-                               } else if (guid != 0) {
+                                           strcmp(iarg->poolname, pname) == 0;
+                               } else if (iarg->guid != 0) {
                                        uint64_t this_guid;
 
                                        matched = nvlist_lookup_uint64(config,
                                            ZPOOL_CONFIG_POOL_GUID,
                                            &this_guid) == 0 &&
-                                           guid == this_guid;
+                                           iarg->guid == this_guid;
                                }
                                if (!matched) {
                                        nvlist_free(config);
@@ -907,17 +1246,20 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv,
                                        continue;
                                }
                                /* use the non-raw path for the config */
-                               (void) strlcpy(end, name, pathleft);
+                               (void) strlcpy(end, slice->rn_name, pathleft);
                                if (add_config(hdl, &pools, path, config) != 0)
                                        goto error;
                        }
+                       free(slice->rn_name);
+                       free(slice);
                }
+               avl_destroy(&slice_cache);
 
                (void) closedir(dirp);
                dirp = NULL;
        }
 
-       ret = get_configs(hdl, &pools, active_ok);
+       ret = get_configs(hdl, &pools, iarg->can_be_active);
 
 error:
        for (pe = pools.pools; pe != NULL; pe = penext) {
@@ -951,27 +1293,12 @@ error:
 nvlist_t *
 zpool_find_import(libzfs_handle_t *hdl, int argc, char **argv)
 {
-       return (zpool_find_import_impl(hdl, argc, argv, B_FALSE, NULL, 0));
-}
+       importargs_t iarg = { 0 };
 
-nvlist_t *
-zpool_find_import_byname(libzfs_handle_t *hdl, int argc, char **argv,
-    char *pool)
-{
-       return (zpool_find_import_impl(hdl, argc, argv, B_FALSE, pool, 0));
-}
+       iarg.paths = argc;
+       iarg.path = argv;
 
-nvlist_t *
-zpool_find_import_byguid(libzfs_handle_t *hdl, int argc, char **argv,
-    uint64_t guid)
-{
-       return (zpool_find_import_impl(hdl, argc, argv, B_FALSE, NULL, guid));
-}
-
-nvlist_t *
-zpool_find_import_activeok(libzfs_handle_t *hdl, int argc, char **argv)
-{
-       return (zpool_find_import_impl(hdl, argc, argv, B_TRUE, NULL, 0));
+       return (zpool_find_import_impl(hdl, &iarg));
 }
 
 /*
@@ -1093,6 +1420,46 @@ zpool_find_import_cached(libzfs_handle_t *hdl, const char *cachefile,
        return (pools);
 }
 
+static int
+name_or_guid_exists(zpool_handle_t *zhp, void *data)
+{
+       importargs_t *import = data;
+       int found = 0;
+
+       if (import->poolname != NULL) {
+               char *pool_name;
+
+               verify(nvlist_lookup_string(zhp->zpool_config,
+                   ZPOOL_CONFIG_POOL_NAME, &pool_name) == 0);
+               if (strcmp(pool_name, import->poolname) == 0)
+                       found = 1;
+       } else {
+               uint64_t pool_guid;
+
+               verify(nvlist_lookup_uint64(zhp->zpool_config,
+                   ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0);
+               if (pool_guid == import->guid)
+                       found = 1;
+       }
+
+       zpool_close(zhp);
+       return (found);
+}
+
+nvlist_t *
+zpool_search_import(libzfs_handle_t *hdl, importargs_t *import)
+{
+       verify(import->poolname == NULL || import->guid == 0);
+
+       if (import->unique)
+               import->exists = zpool_iter(hdl, name_or_guid_exists, import);
+
+       if (import->cachefile != NULL)
+               return (zpool_find_import_cached(hdl, import->cachefile,
+                   import->poolname, import->guid));
+
+       return (zpool_find_import_impl(hdl, import));
+}
 
 boolean_t
 find_guid(nvlist_t *nv, uint64_t guid)
index 7810e5d..0675ec2 100644 (file)
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
  *
  *     zfs_is_shared_nfs()
  *     zfs_is_shared_smb()
- *     zfs_is_shared_iscsi()
  *     zfs_share_proto()
  *     zfs_shareall();
- *     zfs_share_iscsi()
  *     zfs_unshare_nfs()
  *     zfs_unshare_smb()
  *     zfs_unshareall_nfs()
  *     zfs_unshareall_smb()
  *     zfs_unshareall()
  *     zfs_unshareall_bypath()
- *     zfs_unshare_iscsi()
  *
  * The following functions are available for pool consumers, and will
  * mount/unmount and share/unshare all datasets within pool:
@@ -89,11 +85,6 @@ static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *);
 zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **,
     zfs_share_proto_t);
 
-static int (*iscsitgt_zfs_share)(const char *);
-static int (*iscsitgt_zfs_unshare)(const char *);
-static int (*iscsitgt_zfs_is_shared)(const char *);
-static int (*iscsitgt_svc_online)();
-
 /*
  * The share protocols table must be in the same order as the zfs_share_prot_t
  * enum in libzfs_impl.h
@@ -125,29 +116,6 @@ zfs_share_proto_t share_all_proto[] = {
        PROTO_END
 };
 
-#pragma init(zfs_iscsi_init)
-static void
-zfs_iscsi_init(void)
-{
-       void *libiscsitgt;
-
-       if ((libiscsitgt = dlopen("/lib/libiscsitgt.so.1",
-           RTLD_LAZY | RTLD_GLOBAL)) == NULL ||
-           (iscsitgt_zfs_share = (int (*)(const char *))dlsym(libiscsitgt,
-           "iscsitgt_zfs_share")) == NULL ||
-           (iscsitgt_zfs_unshare = (int (*)(const char *))dlsym(libiscsitgt,
-           "iscsitgt_zfs_unshare")) == NULL ||
-           (iscsitgt_zfs_is_shared = (int (*)(const char *))dlsym(libiscsitgt,
-           "iscsitgt_zfs_is_shared")) == NULL ||
-           (iscsitgt_svc_online = (int (*)(const char *))dlsym(libiscsitgt,
-           "iscsitgt_svc_online")) == NULL) {
-               iscsitgt_zfs_share = NULL;
-               iscsitgt_zfs_unshare = NULL;
-               iscsitgt_zfs_is_shared = NULL;
-               iscsitgt_svc_online = NULL;
-       }
-}
-
 /*
  * Search the sharetab for the given mountpoint and protocol, returning
  * a zfs_share_type_t value.
@@ -345,6 +313,18 @@ zfs_mount(zfs_handle_t *zhp, const char *options, int flags)
                } else if (errno == EPERM) {
                        zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                            "Insufficient privileges"));
+               } else if (errno == ENOTSUP) {
+                       char buf[256];
+                       int spa_version;
+
+                       VERIFY(zfs_spa_version(zhp, &spa_version) == 0);
+                       (void) snprintf(buf, sizeof (buf),
+                           dgettext(TEXT_DOMAIN, "Can't mount a version %lld "
+                           "file system on a version %d pool. Pool must be"
+                           " upgraded to mount this file system."),
+                           (u_longlong_t)zfs_prop_get_int(zhp,
+                           ZFS_PROP_VERSION), spa_version);
+                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, buf));
                } else {
                        zfs_error_aux(hdl, strerror(errno));
                }
@@ -445,7 +425,7 @@ zfs_is_shared(zfs_handle_t *zhp)
        zfs_share_proto_t *curr_proto;
 
        if (ZFS_IS_VOLUME(zhp))
-               return (zfs_is_shared_iscsi(zhp));
+               return (B_FALSE);
 
        for (curr_proto = share_all_proto; *curr_proto != PROTO_END;
            curr_proto++)
@@ -458,7 +438,7 @@ int
 zfs_share(zfs_handle_t *zhp)
 {
        if (ZFS_IS_VOLUME(zhp))
-               return (zfs_share_iscsi(zhp));
+               return (0);
 
        return (zfs_share_proto(zhp, share_all_proto));
 }
@@ -467,7 +447,7 @@ int
 zfs_unshare(zfs_handle_t *zhp)
 {
        if (ZFS_IS_VOLUME(zhp))
-               return (zfs_unshare_iscsi(zhp));
+               return (0);
 
        return (zfs_unshareall(zhp));
 }
@@ -999,81 +979,6 @@ remove_mountpoint(zfs_handle_t *zhp)
        }
 }
 
-boolean_t
-zfs_is_shared_iscsi(zfs_handle_t *zhp)
-{
-
-       /*
-        * If iscsi deamon isn't running then we aren't shared
-        */
-       if (iscsitgt_svc_online && iscsitgt_svc_online() == 1)
-               return (B_FALSE);
-       else
-               return (iscsitgt_zfs_is_shared != NULL &&
-                   iscsitgt_zfs_is_shared(zhp->zfs_name) != 0);
-}
-
-int
-zfs_share_iscsi(zfs_handle_t *zhp)
-{
-       char shareopts[ZFS_MAXPROPLEN];
-       const char *dataset = zhp->zfs_name;
-       libzfs_handle_t *hdl = zhp->zfs_hdl;
-
-       /*
-        * Return success if there are no share options.
-        */
-       if (zfs_prop_get(zhp, ZFS_PROP_SHAREISCSI, shareopts,
-           sizeof (shareopts), NULL, NULL, 0, B_FALSE) != 0 ||
-           strcmp(shareopts, "off") == 0)
-               return (0);
-
-       if (iscsitgt_zfs_share == NULL || iscsitgt_zfs_share(dataset) != 0) {
-               int error = EZFS_SHAREISCSIFAILED;
-
-               /*
-                * If service isn't availabele and EPERM was
-                * returned then use special error.
-                */
-               if (iscsitgt_svc_online && errno == EPERM &&
-                   (iscsitgt_svc_online() != 0))
-                       error = EZFS_ISCSISVCUNAVAIL;
-
-               return (zfs_error_fmt(hdl, error,
-                   dgettext(TEXT_DOMAIN, "cannot share '%s'"), dataset));
-       }
-
-       return (0);
-}
-
-int
-zfs_unshare_iscsi(zfs_handle_t *zhp)
-{
-       const char *dataset = zfs_get_name(zhp);
-       libzfs_handle_t *hdl = zhp->zfs_hdl;
-
-       /*
-        * Return if the volume is not shared
-        */
-       if (zfs_is_shared_iscsi(zhp) != SHARED_ISCSI)
-               return (0);
-
-       /*
-        * If this fails with ENODEV it indicates that zvol wasn't shared so
-        * we should return success in that case.
-        */
-       if (iscsitgt_zfs_unshare == NULL ||
-           (iscsitgt_zfs_unshare(dataset) != 0 && errno != ENODEV)) {
-               if (errno == EPERM)
-                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-                           "Insufficient privileges to unshare iscsi"));
-               return (zfs_error_fmt(hdl, EZFS_UNSHAREISCSIFAILED,
-                   dgettext(TEXT_DOMAIN, "cannot unshare '%s'"), dataset));
-       }
-
-       return (0);
-}
-
 typedef struct mount_cbdata {
        zfs_handle_t    **cb_datasets;
        int             cb_used;
@@ -1215,28 +1120,6 @@ out:
        return (ret);
 }
 
-
-static int
-zvol_cb(const char *dataset, void *data)
-{
-       libzfs_handle_t *hdl = data;
-       zfs_handle_t *zhp;
-
-       /*
-        * Ignore snapshots and ignore failures from non-existant datasets.
-        */
-       if (strchr(dataset, '@') != NULL ||
-           (zhp = zfs_open(hdl, dataset, ZFS_TYPE_VOLUME)) == NULL)
-               return (0);
-
-       if (zfs_unshare_iscsi(zhp) != 0)
-               return (-1);
-
-       zfs_close(zhp);
-
-       return (0);
-}
-
 static int
 mountpoint_compare(const void *a, const void *b)
 {
@@ -1246,6 +1129,8 @@ mountpoint_compare(const void *a, const void *b)
        return (strcmp(mountb, mounta));
 }
 
+/* alias for 2002/240 */
+#pragma weak zpool_unmount_datasets = zpool_disable_datasets
 /*
  * Unshare and unmount all datasets within the given pool.  We don't want to
  * rely on traversing the DSL to discover the filesystems within the pool,
@@ -1253,7 +1138,6 @@ mountpoint_compare(const void *a, const void *b)
  * arbitrarily (on I/O error, for example).  Instead, we walk /etc/mnttab and
  * gather all the filesystems that are currently mounted.
  */
-#pragma weak zpool_unmount_datasets = zpool_disable_datasets
 int
 zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
 {
@@ -1267,12 +1151,6 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
        int ret = -1;
        int flags = (force ? MS_FORCE : 0);
 
-       /*
-        * First unshare all zvols.
-        */
-       if (zpool_iter_zvol(zhp, zvol_cb, hdl) != 0)
-               return (-1);
-
        namelen = strlen(zhp->zpool_name);
 
        rewind(hdl->libzfs_mnttab);
index fd734d8..7836e58 100644 (file)
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
-#include <alloca.h>
-#include <assert.h>
 #include <ctype.h>
 #include <errno.h>
 #include <devid.h>
-#include <dirent.h>
 #include <fcntl.h>
 #include <libintl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <strings.h>
 #include <unistd.h>
-#include <zone.h>
 #include <sys/efi_partition.h>
 #include <sys/vtoc.h>
 #include <sys/zfs_ioctl.h>
-#include <sys/zio.h>
-#include <strings.h>
 #include <dlfcn.h>
 
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
 #include "libzfs_impl.h"
+#include "zfs_comutil.h"
 
 static int read_efi_label(nvlist_t *config, diskaddr_t *sb);
 
@@ -193,6 +187,8 @@ zpool_state_to_name(vdev_state_t state, vdev_aux_t aux)
        case VDEV_STATE_CANT_OPEN:
                if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG)
                        return (gettext("FAULTED"));
+               else if (aux == VDEV_AUX_SPLIT_POOL)
+                       return (gettext("SPLIT"));
                else
                        return (gettext("UNAVAIL"));
        case VDEV_STATE_FAULTED:
@@ -273,8 +269,8 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len,
 
                switch (prop) {
                case ZPOOL_PROP_SIZE:
-               case ZPOOL_PROP_USED:
-               case ZPOOL_PROP_AVAILABLE:
+               case ZPOOL_PROP_ALLOCATED:
+               case ZPOOL_PROP_FREE:
                        (void) zfs_nicenum(intval, buf, len);
                        break;
 
@@ -283,11 +279,18 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len,
                            (u_longlong_t)intval);
                        break;
 
+               case ZPOOL_PROP_DEDUPRATIO:
+                       (void) snprintf(buf, len, "%llu.%02llux",
+                           (u_longlong_t)(intval / 100),
+                           (u_longlong_t)(intval % 100));
+                       break;
+
                case ZPOOL_PROP_HEALTH:
                        verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
                            ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
                        verify(nvlist_lookup_uint64_array(nvroot,
-                           ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
+                           ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
+                           == 0);
 
                        (void) strlcpy(buf, zpool_state_to_name(intval,
                            vs->vs_aux), len);
@@ -1004,9 +1007,6 @@ zpool_destroy(zpool_handle_t *zhp)
            ZFS_TYPE_FILESYSTEM)) == NULL)
                return (-1);
 
-       if (zpool_remove_zvol_links(zhp) != 0)
-               return (-1);
-
        (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
 
        if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) {
@@ -1072,7 +1072,8 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot)
                                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                                    "device '%s' contains an EFI label and "
                                    "cannot be used on root pools."),
-                                   zpool_vdev_name(hdl, NULL, spares[s]));
+                                   zpool_vdev_name(hdl, NULL, spares[s],
+                                   B_FALSE));
                                return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg));
                        }
                }
@@ -1167,9 +1168,6 @@ zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce)
        zfs_cmd_t zc = { 0 };
        char msg[1024];
 
-       if (zpool_remove_zvol_links(zhp) != 0)
-               return (-1);
-
        (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
            "cannot export '%s'"), zhp->zpool_name);
 
@@ -1208,6 +1206,127 @@ zpool_export_force(zpool_handle_t *zhp)
        return (zpool_export_common(zhp, B_TRUE, B_TRUE));
 }
 
+static void
+zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun,
+    nvlist_t *rbi)
+{
+       uint64_t rewindto;
+       int64_t loss = -1;
+       struct tm t;
+       char timestr[128];
+
+       if (!hdl->libzfs_printerr || rbi == NULL)
+               return;
+
+       if (nvlist_lookup_uint64(rbi, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
+               return;
+       (void) nvlist_lookup_int64(rbi, ZPOOL_CONFIG_REWIND_TIME, &loss);
+
+       if (localtime_r((time_t *)&rewindto, &t) != NULL &&
+           strftime(timestr, 128, 0, &t) != 0) {
+               if (dryrun) {
+                       (void) printf(dgettext(TEXT_DOMAIN,
+                           "Would be able to return %s "
+                           "to its state as of %s.\n"),
+                           name, timestr);
+               } else {
+                       (void) printf(dgettext(TEXT_DOMAIN,
+                           "Pool %s returned to its state as of %s.\n"),
+                           name, timestr);
+               }
+               if (loss > 120) {
+                       (void) printf(dgettext(TEXT_DOMAIN,
+                           "%s approximately %lld "),
+                           dryrun ? "Would discard" : "Discarded",
+                           (loss + 30) / 60);
+                       (void) printf(dgettext(TEXT_DOMAIN,
+                           "minutes of transactions.\n"));
+               } else if (loss > 0) {
+                       (void) printf(dgettext(TEXT_DOMAIN,
+                           "%s approximately %lld "),
+                           dryrun ? "Would discard" : "Discarded", loss);
+                       (void) printf(dgettext(TEXT_DOMAIN,
+                           "seconds of transactions.\n"));
+               }
+       }
+}
+
+void
+zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason,
+    nvlist_t *config)
+{
+       int64_t loss = -1;
+       uint64_t edata = UINT64_MAX;
+       uint64_t rewindto;
+       struct tm t;
+       char timestr[128];
+
+       if (!hdl->libzfs_printerr)
+               return;
+
+       if (reason >= 0)
+               (void) printf(dgettext(TEXT_DOMAIN, "action: "));
+       else
+               (void) printf(dgettext(TEXT_DOMAIN, "\t"));
+
+       /* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */
+       if (nvlist_lookup_uint64(config,
+           ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
+               goto no_info;
+
+       (void) nvlist_lookup_int64(config, ZPOOL_CONFIG_REWIND_TIME, &loss);
+       (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_LOAD_DATA_ERRORS,
+           &edata);
+
+       (void) printf(dgettext(TEXT_DOMAIN,
+           "Recovery is possible, but will result in some data loss.\n"));
+
+       if (localtime_r((time_t *)&rewindto, &t) != NULL &&
+           strftime(timestr, 128, 0, &t) != 0) {
+               (void) printf(dgettext(TEXT_DOMAIN,
+                   "\tReturning the pool to its state as of %s\n"
+                   "\tshould correct the problem.  "),
+                   timestr);
+       } else {
+               (void) printf(dgettext(TEXT_DOMAIN,
+                   "\tReverting the pool to an earlier state "
+                   "should correct the problem.\n\t"));
+       }
+
+       if (loss > 120) {
+               (void) printf(dgettext(TEXT_DOMAIN,
+                   "Approximately %lld minutes of data\n"
+                   "\tmust be discarded, irreversibly.  "), (loss + 30) / 60);
+       } else if (loss > 0) {
+               (void) printf(dgettext(TEXT_DOMAIN,
+                   "Approximately %lld seconds of data\n"
+                   "\tmust be discarded, irreversibly.  "), loss);
+       }
+       if (edata != 0 && edata != UINT64_MAX) {
+               if (edata == 1) {
+                       (void) printf(dgettext(TEXT_DOMAIN,
+                           "After rewind, at least\n"
+                           "\tone persistent user-data error will remain.  "));
+               } else {
+                       (void) printf(dgettext(TEXT_DOMAIN,
+                           "After rewind, several\n"
+                           "\tpersistent user-data errors will remain.  "));
+               }
+       }
+       (void) printf(dgettext(TEXT_DOMAIN,
+           "Recovery can be attempted\n\tby executing 'zpool %s -F %s'.  "),
+           reason >= 0 ? "clear" : "import", name);
+
+       (void) printf(dgettext(TEXT_DOMAIN,
+           "A scrub of the pool\n"
+           "\tis strongly recommended after recovery.\n"));
+       return;
+
+no_info:
+       (void) printf(dgettext(TEXT_DOMAIN,
+           "Destroy and re-create the pool from\n\ta backup source.\n"));
+}
+
 /*
  * zpool_import() is a contracted interface. Should be kept the same
  * if possible.
@@ -1257,8 +1376,11 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
     nvlist_t *props, boolean_t importfaulted)
 {
        zfs_cmd_t zc = { 0 };
+       zpool_rewind_policy_t policy;
+       nvlist_t *nvi = NULL;
        char *thename;
        char *origname;
+       uint64_t returned_size;
        int ret;
        char errbuf[1024];
 
@@ -1302,11 +1424,30 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
                nvlist_free(props);
                return (-1);
        }
+       returned_size =  zc.zc_nvlist_conf_size + 512;
+       if (zcmd_alloc_dst_nvlist(hdl, &zc, returned_size) != 0) {
+               nvlist_free(props);
+               return (-1);
+       }
 
        zc.zc_cookie = (uint64_t)importfaulted;
        ret = 0;
        if (zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc) != 0) {
                char desc[1024];
+
+               (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
+               zpool_get_rewind_policy(config, &policy);
+               /*
+                * Dry-run failed, but we print out what success
+                * looks like if we found a best txg
+                */
+               if ((policy.zrp_request & ZPOOL_TRY_REWIND) && nvi) {
+                       zpool_rewind_exclaim(hdl, newname ? origname : thename,
+                           B_TRUE, nvi);
+                       nvlist_free(nvi);
+                       return (-1);
+               }
+
                if (newname == NULL)
                        (void) snprintf(desc, sizeof (desc),
                            dgettext(TEXT_DOMAIN, "cannot import '%s'"),
@@ -1328,8 +1469,19 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
                        (void) zfs_error(hdl, EZFS_INVALCONFIG, desc);
                        break;
 
+               case EROFS:
+                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                           "one or more devices is read only"));
+                       (void) zfs_error(hdl, EZFS_BADDEV, desc);
+                       break;
+
                default:
+                       (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
                        (void) zpool_standard_error(hdl, errno, desc);
+                       zpool_explain_recover(hdl,
+                           newname ? origname : thename, -errno, nvi);
+                       nvlist_free(nvi);
+                       break;
                }
 
                ret = -1;
@@ -1339,13 +1491,20 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
                /*
                 * This should never fail, but play it safe anyway.
                 */
-               if (zpool_open_silent(hdl, thename, &zhp) != 0) {
+               if (zpool_open_silent(hdl, thename, &zhp) != 0)
                        ret = -1;
-               } else if (zhp != NULL) {
-                       ret = zpool_create_zvol_links(zhp);
+               else if (zhp != NULL)
                        zpool_close(zhp);
+               (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
+               zpool_get_rewind_policy(config, &policy);
+               if (policy.zrp_request &
+                   (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) {
+                       zpool_rewind_exclaim(hdl, newname ? origname : thename,
+                           ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0),
+                           nvi);
                }
-
+               nvlist_free(nvi);
+               return (0);
        }
 
        zcmd_free_nvlists(&zc);
@@ -1355,28 +1514,83 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
 }
 
 /*
- * Scrub the pool.
+ * Scan the pool.
  */
 int
-zpool_scrub(zpool_handle_t *zhp, pool_scrub_type_t type)
+zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func)
 {
        zfs_cmd_t zc = { 0 };
        char msg[1024];
        libzfs_handle_t *hdl = zhp->zpool_hdl;
 
        (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
-       zc.zc_cookie = type;
+       zc.zc_cookie = func;
 
-       if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SCRUB, &zc) == 0)
+       if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SCAN, &zc) == 0 ||
+           (errno == ENOENT && func != POOL_SCAN_NONE))
                return (0);
 
-       (void) snprintf(msg, sizeof (msg),
-           dgettext(TEXT_DOMAIN, "cannot scrub %s"), zc.zc_name);
+       if (func == POOL_SCAN_SCRUB) {
+               (void) snprintf(msg, sizeof (msg),
+                   dgettext(TEXT_DOMAIN, "cannot scrub %s"), zc.zc_name);
+       } else if (func == POOL_SCAN_NONE) {
+               (void) snprintf(msg, sizeof (msg),
+                   dgettext(TEXT_DOMAIN, "cannot cancel scrubbing %s"),
+                   zc.zc_name);
+       } else {
+               assert(!"unexpected result");
+       }
 
-       if (errno == EBUSY)
-               return (zfs_error(hdl, EZFS_RESILVERING, msg));
-       else
+       if (errno == EBUSY) {
+               nvlist_t *nvroot;
+               pool_scan_stat_t *ps = NULL;
+               uint_t psc;
+
+               verify(nvlist_lookup_nvlist(zhp->zpool_config,
+                   ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+               (void) nvlist_lookup_uint64_array(nvroot,
+                   ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc);
+               if (ps && ps->pss_func == POOL_SCAN_SCRUB)
+                       return (zfs_error(hdl, EZFS_SCRUBBING, msg));
+               else
+                       return (zfs_error(hdl, EZFS_RESILVERING, msg));
+       } else if (errno == ENOENT) {
+               return (zfs_error(hdl, EZFS_NO_SCRUB, msg));
+       } else {
                return (zpool_standard_error(hdl, errno, msg));
+       }
+}
+
+/*
+ * This provides a very minimal check whether a given string is likely a
+ * c#t#d# style string.  Users of this are expected to do their own
+ * verification of the s# part.
+ */
+#define        CTD_CHECK(str)  (str && str[0] == 'c' && isdigit(str[1]))
+
+/*
+ * More elaborate version for ones which may start with "/dev/dsk/"
+ * and the like.
+ */
+static int
+ctd_check_path(char *str) {
+       /*
+        * If it starts with a slash, check the last component.
+        */
+       if (str && str[0] == '/') {
+               char *tmp = strrchr(str, '/');
+
+               /*
+                * If it ends in "/old", check the second-to-last
+                * component of the string instead.
+                */
+               if (tmp != str && strcmp(tmp, "/old") == 0) {
+                       for (tmp--; *tmp != '/'; tmp--)
+                               ;
+               }
+               str = tmp + 1;
+       }
+       return (CTD_CHECK(str));
 }
 
 /*
@@ -1433,25 +1647,99 @@ vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare,
                        break;
 
                /*
-                * Search for the requested value. We special case the search
-                * for ZPOOL_CONFIG_PATH when it's a wholedisk. Otherwise,
-                * all other searches are simple string compares.
+                * Search for the requested value. Special cases:
+                *
+                * - ZPOOL_CONFIG_PATH for whole disk entries.  These end in
+                *   "s0" or "s0/old".  The "s0" part is hidden from the user,
+                *   but included in the string, so this matches around it.
+                * - looking for a top-level vdev name (i.e. ZPOOL_CONFIG_TYPE).
+                *
+                * Otherwise, all other searches are simple string compares.
                 */
-               if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0 && val) {
+               if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0 &&
+                   ctd_check_path(val)) {
                        uint64_t wholedisk = 0;
 
                        (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
                            &wholedisk);
                        if (wholedisk) {
+                               int slen = strlen(srchval);
+                               int vlen = strlen(val);
+
+                               if (slen != vlen - 2)
+                                       break;
+
+                               /*
+                                * make_leaf_vdev() should only set
+                                * wholedisk for ZPOOL_CONFIG_PATHs which
+                                * will include "/dev/dsk/", giving plenty of
+                                * room for the indices used next.
+                                */
+                               ASSERT(vlen >= 6);
+
                                /*
-                                * For whole disks, the internal path has 's0',
-                                * but the path passed in by the user doesn't.
+                                * strings identical except trailing "s0"
                                 */
-                               if (strlen(srchval) == strlen(val) - 2 &&
-                                   strncmp(srchval, val, strlen(srchval)) == 0)
+                               if (strcmp(&val[vlen - 2], "s0") == 0 &&
+                                   strncmp(srchval, val, slen) == 0)
                                        return (nv);
+
+                               /*
+                                * strings identical except trailing "s0/old"
+                                */
+                               if (strcmp(&val[vlen - 6], "s0/old") == 0 &&
+                                   strcmp(&srchval[slen - 4], "/old") == 0 &&
+                                   strncmp(srchval, val, slen - 4) == 0)
+                                       return (nv);
+
                                break;
                        }
+               } else if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) {
+                       char *type, *idx, *end, *p;
+                       uint64_t id, vdev_id;
+
+                       /*
+                        * Determine our vdev type, keeping in mind
+                        * that the srchval is composed of a type and
+                        * vdev id pair (i.e. mirror-4).
+                        */
+                       if ((type = strdup(srchval)) == NULL)
+                               return (NULL);
+
+                       if ((p = strrchr(type, '-')) == NULL) {
+                               free(type);
+                               break;
+                       }
+                       idx = p + 1;
+                       *p = '\0';
+
+                       /*
+                        * If the types don't match then keep looking.
+                        */
+                       if (strncmp(val, type, strlen(val)) != 0) {
+                               free(type);
+                               break;
+                       }
+
+                       verify(strncmp(type, VDEV_TYPE_RAIDZ,
+                           strlen(VDEV_TYPE_RAIDZ)) == 0 ||
+                           strncmp(type, VDEV_TYPE_MIRROR,
+                           strlen(VDEV_TYPE_MIRROR)) == 0);
+                       verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
+                           &id) == 0);
+
+                       errno = 0;
+                       vdev_id = strtoull(idx, &end, 10);
+
+                       free(type);
+                       if (errno != 0)
+                               return (NULL);
+
+                       /*
+                        * Now verify that we have the correct vdev id.
+                        */
+                       if (vdev_id == id)
+                               return (nv);
                }
 
                /*
@@ -1537,6 +1825,18 @@ zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath,
        return (ret);
 }
 
+/*
+ * Determine if we have an "interior" top-level vdev (i.e. mirror/raidz).
+ */
+boolean_t
+zpool_vdev_is_interior(const char *name)
+{
+       if (strncmp(name, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 ||
+           strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0)
+               return (B_TRUE);
+       return (B_FALSE);
+}
+
 nvlist_t *
 zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
     boolean_t *l2cache, boolean_t *log)
@@ -1551,6 +1851,8 @@ zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
        guid = strtoull(path, &end, 10);
        if (guid != 0 && *end == '\0') {
                verify(nvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid) == 0);
+       } else if (zpool_vdev_is_interior(path)) {
+               verify(nvlist_add_string(search, ZPOOL_CONFIG_TYPE, path) == 0);
        } else if (path[0] != '/') {
                (void) snprintf(buf, sizeof (buf), "%s%s", "/dev/dsk/", path);
                verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, buf) == 0);
@@ -1721,34 +2023,6 @@ zpool_get_physpath(zpool_handle_t *zhp, char *physpath, size_t phypath_size)
 }
 
 /*
- * Returns TRUE if the given guid corresponds to the given type.
- * This is used to check for hot spares (INUSE or not), and level 2 cache
- * devices.
- */
-static boolean_t
-is_guid_type(zpool_handle_t *zhp, uint64_t guid, const char *type)
-{
-       uint64_t target_guid;
-       nvlist_t *nvroot;
-       nvlist_t **list;
-       uint_t count;
-       int i;
-
-       verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
-           &nvroot) == 0);
-       if (nvlist_lookup_nvlist_array(nvroot, type, &list, &count) == 0) {
-               for (i = 0; i < count; i++) {
-                       verify(nvlist_lookup_uint64(list[i], ZPOOL_CONFIG_GUID,
-                           &target_guid) == 0);
-                       if (guid == target_guid)
-                               return (B_TRUE);
-               }
-       }
-
-       return (B_FALSE);
-}
-
-/*
  * If the device has being dynamically expanded then we need to relabel
  * the disk to use the new unallocated space.
  */
@@ -1816,8 +2090,7 @@ zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags,
 
        verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
 
-       if (avail_spare ||
-           is_guid_type(zhp, zc.zc_guid, ZPOOL_CONFIG_SPARES) == B_TRUE)
+       if (avail_spare)
                return (zfs_error(hdl, EZFS_ISSPARE, msg));
 
        if (flags & ZFS_ONLINE_EXPAND ||
@@ -1848,8 +2121,15 @@ zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags,
        zc.zc_cookie = VDEV_STATE_ONLINE;
        zc.zc_obj = flags;
 
-       if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0)
+       if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) {
+               if (errno == EINVAL) {
+                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "was split "
+                           "from this pool into a new one.  Use '%s' "
+                           "instead"), "zpool detach");
+                       return (zfs_error(hdl, EZFS_POSTSPLIT_ONLINE, msg));
+               }
                return (zpool_standard_error(hdl, errno, msg));
+       }
 
        *newstate = zc.zc_cookie;
        return (0);
@@ -1877,8 +2157,7 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp)
 
        verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
 
-       if (avail_spare ||
-           is_guid_type(zhp, zc.zc_guid, ZPOOL_CONFIG_SPARES) == B_TRUE)
+       if (avail_spare)
                return (zfs_error(hdl, EZFS_ISSPARE, msg));
 
        zc.zc_cookie = VDEV_STATE_OFFLINE;
@@ -1910,7 +2189,7 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp)
  * Mark the given vdev faulted.
  */
 int
-zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid)
+zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
 {
        zfs_cmd_t zc = { 0 };
        char msg[1024];
@@ -1922,6 +2201,7 @@ zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid)
        (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
        zc.zc_guid = guid;
        zc.zc_cookie = VDEV_STATE_FAULTED;
+       zc.zc_obj = aux;
 
        if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
                return (0);
@@ -1944,7 +2224,7 @@ zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid)
  * Mark the given vdev degraded.
  */
 int
-zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid)
+zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
 {
        zfs_cmd_t zc = { 0 };
        char msg[1024];
@@ -1956,6 +2236,7 @@ zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid)
        (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
        zc.zc_guid = guid;
        zc.zc_cookie = VDEV_STATE_DEGRADED;
+       zc.zc_obj = aux;
 
        if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
                return (0);
@@ -2053,7 +2334,7 @@ zpool_vdev_attach(zpool_handle_t *zhp,
        verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
            ZPOOL_CONFIG_VDEV_TREE, &config_root) == 0);
 
-       if ((newname = zpool_vdev_name(NULL, NULL, child[0])) == NULL)
+       if ((newname = zpool_vdev_name(NULL, NULL, child[0], B_FALSE)) == NULL)
                return (-1);
 
        /*
@@ -2241,6 +2522,257 @@ zpool_vdev_detach(zpool_handle_t *zhp, const char *path)
 }
 
 /*
+ * Find a mirror vdev in the source nvlist.
+ *
+ * The mchild array contains a list of disks in one of the top-level mirrors
+ * of the source pool.  The schild array contains a list of disks that the
+ * user specified on the command line.  We loop over the mchild array to
+ * see if any entry in the schild array matches.
+ *
+ * If a disk in the mchild array is found in the schild array, we return
+ * the index of that entry.  Otherwise we return -1.
+ */
+static int
+find_vdev_entry(zpool_handle_t *zhp, nvlist_t **mchild, uint_t mchildren,
+    nvlist_t **schild, uint_t schildren)
+{
+       uint_t mc;
+
+       for (mc = 0; mc < mchildren; mc++) {
+               uint_t sc;
+               char *mpath = zpool_vdev_name(zhp->zpool_hdl, zhp,
+                   mchild[mc], B_FALSE);
+
+               for (sc = 0; sc < schildren; sc++) {
+                       char *spath = zpool_vdev_name(zhp->zpool_hdl, zhp,
+                           schild[sc], B_FALSE);
+                       boolean_t result = (strcmp(mpath, spath) == 0);
+
+                       free(spath);
+                       if (result) {
+                               free(mpath);
+                               return (mc);
+                       }
+               }
+
+               free(mpath);
+       }
+
+       return (-1);
+}
+
+/*
+ * Split a mirror pool.  If newroot points to null, then a new nvlist
+ * is generated and it is the responsibility of the caller to free it.
+ */
+int
+zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot,
+    nvlist_t *props, splitflags_t flags)
+{
+       zfs_cmd_t zc = { 0 };
+       char msg[1024];
+       nvlist_t *tree, *config, **child, **newchild, *newconfig = NULL;
+       nvlist_t **varray = NULL, *zc_props = NULL;
+       uint_t c, children, newchildren, lastlog = 0, vcount, found = 0;
+       libzfs_handle_t *hdl = zhp->zpool_hdl;
+       uint64_t vers;
+       boolean_t freelist = B_FALSE, memory_err = B_TRUE;
+       int retval = 0;
+
+       (void) snprintf(msg, sizeof (msg),
+           dgettext(TEXT_DOMAIN, "Unable to split %s"), zhp->zpool_name);
+
+       if (!zpool_name_valid(hdl, B_FALSE, newname))
+               return (zfs_error(hdl, EZFS_INVALIDNAME, msg));
+
+       if ((config = zpool_get_config(zhp, NULL)) == NULL) {
+               (void) fprintf(stderr, gettext("Internal error: unable to "
+                   "retrieve pool configuration\n"));
+               return (-1);
+       }
+
+       verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree)
+           == 0);
+       verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &vers) == 0);
+
+       if (props) {
+               if ((zc_props = zpool_valid_proplist(hdl, zhp->zpool_name,
+                   props, vers, B_TRUE, msg)) == NULL)
+                       return (-1);
+       }
+
+       if (nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
+           &children) != 0) {
+               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                   "Source pool is missing vdev tree"));
+               if (zc_props)
+                       nvlist_free(zc_props);
+               return (-1);
+       }
+
+       varray = zfs_alloc(hdl, children * sizeof (nvlist_t *));
+       vcount = 0;
+
+       if (*newroot == NULL ||
+           nvlist_lookup_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN,
+           &newchild, &newchildren) != 0)
+               newchildren = 0;
+
+       for (c = 0; c < children; c++) {
+               uint64_t is_log = B_FALSE, is_hole = B_FALSE;
+               char *type;
+               nvlist_t **mchild, *vdev;
+               uint_t mchildren;
+               int entry;
+
+               /*
+                * Unlike cache & spares, slogs are stored in the
+                * ZPOOL_CONFIG_CHILDREN array.  We filter them out here.
+                */
+               (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+                   &is_log);
+               (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
+                   &is_hole);
+               if (is_log || is_hole) {
+                       /*
+                        * Create a hole vdev and put it in the config.
+                        */
+                       if (nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) != 0)
+                               goto out;
+                       if (nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE,
+                           VDEV_TYPE_HOLE) != 0)
+                               goto out;
+                       if (nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_HOLE,
+                           1) != 0)
+                               goto out;
+                       if (lastlog == 0)
+                               lastlog = vcount;
+                       varray[vcount++] = vdev;
+                       continue;
+               }
+               lastlog = 0;
+               verify(nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type)
+                   == 0);
+               if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
+                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                           "Source pool must be composed only of mirrors\n"));
+                       retval = zfs_error(hdl, EZFS_INVALCONFIG, msg);
+                       goto out;
+               }
+
+               verify(nvlist_lookup_nvlist_array(child[c],
+                   ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0);
+
+               /* find or add an entry for this top-level vdev */
+               if (newchildren > 0 &&
+                   (entry = find_vdev_entry(zhp, mchild, mchildren,
+                   newchild, newchildren)) >= 0) {
+                       /* We found a disk that the user specified. */
+                       vdev = mchild[entry];
+                       ++found;
+               } else {
+                       /* User didn't specify a disk for this vdev. */
+                       vdev = mchild[mchildren - 1];
+               }
+
+               if (nvlist_dup(vdev, &varray[vcount++], 0) != 0)
+                       goto out;
+       }
+
+       /* did we find every disk the user specified? */
+       if (found != newchildren) {
+               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Device list must "
+                   "include at most one disk from each mirror"));
+               retval = zfs_error(hdl, EZFS_INVALCONFIG, msg);
+               goto out;
+       }
+
+       /* Prepare the nvlist for populating. */
+       if (*newroot == NULL) {
+               if (nvlist_alloc(newroot, NV_UNIQUE_NAME, 0) != 0)
+                       goto out;
+               freelist = B_TRUE;
+               if (nvlist_add_string(*newroot, ZPOOL_CONFIG_TYPE,
+                   VDEV_TYPE_ROOT) != 0)
+                       goto out;
+       } else {
+               verify(nvlist_remove_all(*newroot, ZPOOL_CONFIG_CHILDREN) == 0);
+       }
+
+       /* Add all the children we found */
+       if (nvlist_add_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, varray,
+           lastlog == 0 ? vcount : lastlog) != 0)
+               goto out;
+
+       /*
+        * If we're just doing a dry run, exit now with success.
+        */
+       if (flags.dryrun) {
+               memory_err = B_FALSE;
+               freelist = B_FALSE;
+               goto out;
+       }
+
+       /* now build up the config list & call the ioctl */
+       if (nvlist_alloc(&newconfig, NV_UNIQUE_NAME, 0) != 0)
+               goto out;
+
+       if (nvlist_add_nvlist(newconfig,
+           ZPOOL_CONFIG_VDEV_TREE, *newroot) != 0 ||
+           nvlist_add_string(newconfig,
+           ZPOOL_CONFIG_POOL_NAME, newname) != 0 ||
+           nvlist_add_uint64(newconfig, ZPOOL_CONFIG_VERSION, vers) != 0)
+               goto out;
+
+       /*
+        * The new pool is automatically part of the namespace unless we
+        * explicitly export it.
+        */
+       if (!flags.import)
+               zc.zc_cookie = ZPOOL_EXPORT_AFTER_SPLIT;
+       (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+       (void) strlcpy(zc.zc_string, newname, sizeof (zc.zc_string));
+       if (zcmd_write_conf_nvlist(hdl, &zc, newconfig) != 0)
+               goto out;
+       if (zc_props != NULL && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0)
+               goto out;
+
+       if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SPLIT, &zc) != 0) {
+               retval = zpool_standard_error(hdl, errno, msg);
+               goto out;
+       }
+
+       freelist = B_FALSE;
+       memory_err = B_FALSE;
+
+out:
+       if (varray != NULL) {
+               int v;
+
+               for (v = 0; v < vcount; v++)
+                       nvlist_free(varray[v]);
+               free(varray);
+       }
+       zcmd_free_nvlists(&zc);
+       if (zc_props)
+               nvlist_free(zc_props);
+       if (newconfig)
+               nvlist_free(newconfig);
+       if (freelist) {
+               nvlist_free(*newroot);
+               *newroot = NULL;
+       }
+
+       if (retval != 0)
+               return (retval);
+
+       if (memory_err)
+               return (no_memory(hdl));
+
+       return (0);
+}
+
+/*
  * Remove the given device.  Currently, this is supported only for hot spares
  * and level 2 cache devices.
  */
@@ -2250,24 +2782,34 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path)
        zfs_cmd_t zc = { 0 };
        char msg[1024];
        nvlist_t *tgt;
-       boolean_t avail_spare, l2cache;
+       boolean_t avail_spare, l2cache, islog;
        libzfs_handle_t *hdl = zhp->zpool_hdl;
+       uint64_t version;
 
        (void) snprintf(msg, sizeof (msg),
            dgettext(TEXT_DOMAIN, "cannot remove %s"), path);
 
        (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
        if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
-           NULL)) == 0)
+           &islog)) == 0)
                return (zfs_error(hdl, EZFS_NODEVICE, msg));
-
-       if (!avail_spare && !l2cache) {
+       /*
+        * XXX - this should just go away.
+        */
+       if (!avail_spare && !l2cache && !islog) {
                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-                   "only inactive hot spares or cache devices "
-                   "can be removed"));
+                   "only inactive hot spares, cache, top-level, "
+                   "or log devices can be removed"));
                return (zfs_error(hdl, EZFS_NODEVICE, msg));
        }
 
+       version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
+       if (islog && version < SPA_VERSION_HOLES) {
+               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                   "pool must be upgraded to support log removal"));
+               return (zfs_error(hdl, EZFS_BADVERSION, msg));
+       }
+
        verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
 
        if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0)
@@ -2280,13 +2822,15 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path)
  * Clear the errors for the pool, or the particular device if specified.
  */
 int
-zpool_clear(zpool_handle_t *zhp, const char *path)
+zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl)
 {
        zfs_cmd_t zc = { 0 };
        char msg[1024];
        nvlist_t *tgt;
+       zpool_rewind_policy_t policy;
        boolean_t avail_spare, l2cache;
        libzfs_handle_t *hdl = zhp->zpool_hdl;
+       nvlist_t *nvi = NULL;
 
        if (path)
                (void) snprintf(msg, sizeof (msg),
@@ -2314,9 +2858,31 @@ zpool_clear(zpool_handle_t *zhp, const char *path)
                    &zc.zc_guid) == 0);
        }
 
-       if (zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc) == 0)
+       zpool_get_rewind_policy(rewindnvl, &policy);
+       zc.zc_cookie = policy.zrp_request;
+
+       if (zcmd_alloc_dst_nvlist(hdl, &zc, 8192) != 0)
+               return (-1);
+
+       if (zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, rewindnvl) != 0)
+               return (-1);
+
+       if (zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc) == 0 ||
+           ((policy.zrp_request & ZPOOL_TRY_REWIND) &&
+           errno != EPERM && errno != EACCES)) {
+               if (policy.zrp_request &
+                   (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) {
+                       (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
+                       zpool_rewind_exclaim(hdl, zc.zc_name,
+                           ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0),
+                           nvi);
+                       nvlist_free(nvi);
+               }
+               zcmd_free_nvlists(&zc);
                return (0);
+       }
 
+       zcmd_free_nvlists(&zc);
        return (zpool_standard_error(hdl, errno, msg));
 }
 
@@ -2336,6 +2902,7 @@ zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid)
 
        (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
        zc.zc_guid = guid;
+       zc.zc_cookie = ZPOOL_NO_REWIND;
 
        if (ioctl(hdl->libzfs_fd, ZFS_IOC_CLEAR, &zc) == 0)
                return (0);
@@ -2344,173 +2911,6 @@ zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid)
 }
 
 /*
- * Iterate over all zvols in a given pool by walking the /dev/zvol/dsk/<pool>
- * hierarchy.
- */
-int
-zpool_iter_zvol(zpool_handle_t *zhp, int (*cb)(const char *, void *),
-    void *data)
-{
-       libzfs_handle_t *hdl = zhp->zpool_hdl;
-       char (*paths)[MAXPATHLEN];
-       size_t size = 4;
-       int curr, fd, base, ret = 0;
-       DIR *dirp;
-       struct dirent *dp;
-       struct stat st;
-
-       if ((base = open("/dev/zvol/dsk", O_RDONLY)) < 0)
-               return (errno == ENOENT ? 0 : -1);
-
-       if (fstatat(base, zhp->zpool_name, &st, 0) != 0) {
-               int err = errno;
-               (void) close(base);
-               return (err == ENOENT ? 0 : -1);
-       }
-
-       /*
-        * Oddly this wasn't a directory -- ignore that failure since we
-        * know there are no links lower in the (non-existant) hierarchy.
-        */
-       if (!S_ISDIR(st.st_mode)) {
-               (void) close(base);
-               return (0);
-       }
-
-       if ((paths = zfs_alloc(hdl, size * sizeof (paths[0]))) == NULL) {
-               (void) close(base);
-               return (-1);
-       }
-
-       (void) strlcpy(paths[0], zhp->zpool_name, sizeof (paths[0]));
-       curr = 0;
-
-       while (curr >= 0) {
-               if (fstatat(base, paths[curr], &st, AT_SYMLINK_NOFOLLOW) != 0)
-                       goto err;
-
-               if (S_ISDIR(st.st_mode)) {
-                       if ((fd = openat(base, paths[curr], O_RDONLY)) < 0)
-                               goto err;
-
-                       if ((dirp = fdopendir(fd)) == NULL) {
-                               (void) close(fd);
-                               goto err;
-                       }
-
-                       while ((dp = readdir(dirp)) != NULL) {
-                               if (dp->d_name[0] == '.')
-                                       continue;
-
-                               if (curr + 1 == size) {
-                                       paths = zfs_realloc(hdl, paths,
-                                           size * sizeof (paths[0]),
-                                           size * 2 * sizeof (paths[0]));
-                                       if (paths == NULL) {
-                                               (void) closedir(dirp);
-                                               (void) close(fd);
-                                               goto err;
-                                       }
-
-                                       size *= 2;
-                               }
-
-                               (void) strlcpy(paths[curr + 1], paths[curr],
-                                   sizeof (paths[curr + 1]));
-                               (void) strlcat(paths[curr], "/",
-                                   sizeof (paths[curr]));
-                               (void) strlcat(paths[curr], dp->d_name,
-                                   sizeof (paths[curr]));
-                               curr++;
-                       }
-
-                       (void) closedir(dirp);
-
-               } else {
-                       if ((ret = cb(paths[curr], data)) != 0)
-                               break;
-               }
-
-               curr--;
-       }
-
-       free(paths);
-       (void) close(base);
-
-       return (ret);
-
-err:
-       free(paths);
-       (void) close(base);
-       return (-1);
-}
-
-typedef struct zvol_cb {
-       zpool_handle_t *zcb_pool;
-       boolean_t zcb_create;
-} zvol_cb_t;
-
-/*ARGSUSED*/
-static int
-do_zvol_create(zfs_handle_t *zhp, void *data)
-{
-       int ret = 0;
-
-       if (ZFS_IS_VOLUME(zhp)) {
-               (void) zvol_create_link(zhp->zfs_hdl, zhp->zfs_name);
-               ret = zfs_iter_snapshots(zhp, do_zvol_create, NULL);
-       }
-
-       if (ret == 0)
-               ret = zfs_iter_filesystems(zhp, do_zvol_create, NULL);
-
-       zfs_close(zhp);
-
-       return (ret);
-}
-
-/*
- * Iterate over all zvols in the pool and make any necessary minor nodes.
- */
-int
-zpool_create_zvol_links(zpool_handle_t *zhp)
-{
-       zfs_handle_t *zfp;
-       int ret;
-
-       /*
-        * If the pool is unavailable, just return success.
-        */
-       if ((zfp = make_dataset_handle(zhp->zpool_hdl,
-           zhp->zpool_name)) == NULL)
-               return (0);
-
-       ret = zfs_iter_filesystems(zfp, do_zvol_create, NULL);
-
-       zfs_close(zfp);
-       return (ret);
-}
-
-static int
-do_zvol_remove(const char *dataset, void *data)
-{
-       zpool_handle_t *zhp = data;
-
-       return (zvol_remove_link(zhp->zpool_hdl, dataset));
-}
-
-/*
- * Iterate over all zvols in the pool and remove any minor nodes.  We iterate
- * by examining the /dev links so that a corrupted pool doesn't impede this
- * operation.
- */
-int
-zpool_remove_zvol_links(zpool_handle_t *zhp)
-{
-       return (zpool_iter_zvol(zhp, do_zvol_remove, zhp));
-}
-
-/*
  * Convert from a devid string to a path.
  */
 static char *
@@ -2602,7 +3002,8 @@ set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path)
  * of these checks.
  */
 char *
-zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv)
+zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
+    boolean_t verbose)
 {
        char *path, *devid;
        uint64_t value;
@@ -2625,7 +3026,7 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv)
                 * open a misbehaving device, which can have undesirable
                 * effects.
                 */
-               if ((nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
+               if ((nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
                    (uint64_t **)&vs, &vsc) != 0 ||
                    vs->vs_state >= VDEV_STATE_DEGRADED) &&
                    zhp != NULL &&
@@ -2662,10 +3063,23 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv)
 
                if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
                    &value) == 0 && value) {
+                       int pathlen = strlen(path);
                        char *tmp = zfs_strdup(hdl, path);
-                       if (tmp == NULL)
-                               return (NULL);
-                       tmp[strlen(path) - 2] = '\0';
+
+                       /*
+                        * If it starts with c#, and ends with "s0", chop
+                        * the "s0" off, or if it ends with "s0/old", remove
+                        * the "s0" from the middle.
+                        */
+                       if (CTD_CHECK(tmp)) {
+                               if (strcmp(&tmp[pathlen - 2], "s0") == 0) {
+                                       tmp[pathlen - 2] = '\0';
+                               } else if (pathlen > 6 &&
+                                   strcmp(&tmp[pathlen - 6], "s0/old") == 0) {
+                                       (void) strcpy(&tmp[pathlen - 6],
+                                           "/old");
+                               }
+                       }
                        return (tmp);
                }
        } else {
@@ -2681,6 +3095,20 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv)
                            (u_longlong_t)value);
                        path = buf;
                }
+
+               /*
+                * We identify each top-level vdev by using a <type-id>
+                * naming convention.
+                */
+               if (verbose) {
+                       uint64_t id;
+
+                       verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
+                           &id) == 0);
+                       (void) snprintf(buf, sizeof (buf), "%s-%llu", path,
+                           (u_longlong_t)id);
+                       path = buf;
+               }
        }
 
        return (zfs_strdup(hdl, path));
@@ -2899,7 +3327,7 @@ get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len)
  * into 'records'.  'leftover' is set to the number of bytes that weren't
  * processed as there wasn't a complete record.
  */
-static int
+int
 zpool_history_unpack(char *buf, uint64_t bytes_read, uint64_t *leftover,
     nvlist_t ***records, uint_t *numrecords)
 {
@@ -3218,6 +3646,7 @@ supported_dump_vdev_type(libzfs_handle_t *hdl, nvlist_t *config, char *errbuf)
        if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
            strcmp(type, VDEV_TYPE_FILE) == 0 ||
            strcmp(type, VDEV_TYPE_LOG) == 0 ||
+           strcmp(type, VDEV_TYPE_HOLE) == 0 ||
            strcmp(type, VDEV_TYPE_MISSING) == 0) {
                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                    "vdev type '%s' is not supported"), type);
index 1ffb629..672e004 100644 (file)
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <assert.h>
 #include <ctype.h>
 #include <errno.h>
-#include <libdevinfo.h>
 #include <libintl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
 #include <fcntl.h>
 #include <sys/mount.h>
-#include <sys/mntent.h>
-#include <sys/mnttab.h>
-#include <sys/avl.h>
-#include <stddef.h>
+#include <pthread.h>
+#include <umem.h>
 
 #include <libzfs.h>
 
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
+#include "zfs_fletcher.h"
 #include "libzfs_impl.h"
+#include <sha2.h>
+#include <sys/zio_checksum.h>
+#include <sys/ddt.h>
 
-#include <fletcher.c> /* XXX */
+/* in libzfs_dataset.c */
+extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *);
 
 static int zfs_receive_impl(libzfs_handle_t *, const char *, recvflags_t,
-    int, avl_tree_t *, char **);
+    int, const char *, nvlist_t *, avl_tree_t *, char **);
+
+static const zio_cksum_t zero_cksum = { 0 };
+
+typedef struct dedup_arg {
+       int     inputfd;
+       int     outputfd;
+       libzfs_handle_t  *dedup_hdl;
+} dedup_arg_t;
+
+typedef struct dataref {
+       uint64_t ref_guid;
+       uint64_t ref_object;
+       uint64_t ref_offset;
+} dataref_t;
+
+typedef struct dedup_entry {
+       struct dedup_entry      *dde_next;
+       zio_cksum_t dde_chksum;
+       uint64_t dde_prop;
+       dataref_t dde_ref;
+} dedup_entry_t;
+
+#define        MAX_DDT_PHYSMEM_PERCENT         20
+#define        SMALLEST_POSSIBLE_MAX_DDT_MB            128
+
+typedef struct dedup_table {
+       dedup_entry_t   **dedup_hash_array;
+       umem_cache_t    *ddecache;
+       uint64_t        max_ddt_size;  /* max dedup table size in bytes */
+       uint64_t        cur_ddt_size;  /* current dedup table size in bytes */
+       uint64_t        ddt_count;
+       int             numhashbits;
+       boolean_t       ddt_full;
+} dedup_table_t;
+
+static int
+high_order_bit(uint64_t n)
+{
+       int count;
+
+       for (count = 0; n != 0; count++)
+               n >>= 1;
+       return (count);
+}
+
+static size_t
+ssread(void *buf, size_t len, FILE *stream)
+{
+       size_t outlen;
+
+       if ((outlen = fread(buf, len, 1, stream)) == 0)
+               return (0);
+
+       return (outlen);
+}
+
+static void
+ddt_hash_append(libzfs_handle_t *hdl, dedup_table_t *ddt, dedup_entry_t **ddepp,
+    zio_cksum_t *cs, uint64_t prop, dataref_t *dr)
+{
+       dedup_entry_t   *dde;
+
+       if (ddt->cur_ddt_size >= ddt->max_ddt_size) {
+               if (ddt->ddt_full == B_FALSE) {
+                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                           "Dedup table full.  Deduplication will continue "
+                           "with existing table entries"));
+                       ddt->ddt_full = B_TRUE;
+               }
+               return;
+       }
+
+       if ((dde = umem_cache_alloc(ddt->ddecache, UMEM_DEFAULT))
+           != NULL) {
+               assert(*ddepp == NULL);
+               dde->dde_next = NULL;
+               dde->dde_chksum = *cs;
+               dde->dde_prop = prop;
+               dde->dde_ref = *dr;
+               *ddepp = dde;
+               ddt->cur_ddt_size += sizeof (dedup_entry_t);
+               ddt->ddt_count++;
+       }
+}
+
+/*
+ * Using the specified dedup table, do a lookup for an entry with
+ * the checksum cs.  If found, return the block's reference info
+ * in *dr. Otherwise, insert a new entry in the dedup table, using
+ * the reference information specified by *dr.
+ *
+ * return value:  true - entry was found
+ *               false - entry was not found
+ */
+static boolean_t
+ddt_update(libzfs_handle_t *hdl, dedup_table_t *ddt, zio_cksum_t *cs,
+    uint64_t prop, dataref_t *dr)
+{
+       uint32_t hashcode;
+       dedup_entry_t **ddepp;
+
+       hashcode = BF64_GET(cs->zc_word[0], 0, ddt->numhashbits);
+
+       for (ddepp = &(ddt->dedup_hash_array[hashcode]); *ddepp != NULL;
+           ddepp = &((*ddepp)->dde_next)) {
+               if (ZIO_CHECKSUM_EQUAL(((*ddepp)->dde_chksum), *cs) &&
+                   (*ddepp)->dde_prop == prop) {
+                       *dr = (*ddepp)->dde_ref;
+                       return (B_TRUE);
+               }
+       }
+       ddt_hash_append(hdl, ddt, ddepp, cs, prop, dr);
+       return (B_FALSE);
+}
+
+static int
+cksum_and_write(const void *buf, uint64_t len, zio_cksum_t *zc, int outfd)
+{
+       fletcher_4_incremental_native(buf, len, zc);
+       return (write(outfd, buf, len));
+}
+
+/*
+ * This function is started in a separate thread when the dedup option
+ * has been requested.  The main send thread determines the list of
+ * snapshots to be included in the send stream and makes the ioctl calls
+ * for each one.  But instead of having the ioctl send the output to
+ * the output fd specified by the caller of zfs_send(), the
+ * ioctl is told to direct the output to a pipe, which is read by the
+ * alternate thread running THIS function.  This function does the
+ * dedup'ing by:
+ *  1. building a dedup table (the DDT)
+ *  2. doing checksums on each data block and inserting a record in the DDT
+ *  3. looking for matching checksums, and
+ *  4.  sending a DRR_WRITE_BYREF record instead of a write record whenever
+ *      a duplicate block is found.
+ * The output of this function then goes to the output fd requested
+ * by the caller of zfs_send().
+ */
+static void *
+cksummer(void *arg)
+{
+       dedup_arg_t *dda = arg;
+       char *buf = malloc(1<<20);
+       dmu_replay_record_t thedrr;
+       dmu_replay_record_t *drr = &thedrr;
+       struct drr_begin *drrb = &thedrr.drr_u.drr_begin;
+       struct drr_end *drre = &thedrr.drr_u.drr_end;
+       struct drr_object *drro = &thedrr.drr_u.drr_object;
+       struct drr_write *drrw = &thedrr.drr_u.drr_write;
+       struct drr_spill *drrs = &thedrr.drr_u.drr_spill;
+       FILE *ofp;
+       int outfd;
+       dmu_replay_record_t wbr_drr = {0};
+       struct drr_write_byref *wbr_drrr = &wbr_drr.drr_u.drr_write_byref;
+       dedup_table_t ddt;
+       zio_cksum_t stream_cksum;
+       uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE);
+       uint64_t numbuckets;
+
+       ddt.max_ddt_size =
+           MAX((physmem * MAX_DDT_PHYSMEM_PERCENT)/100,
+           SMALLEST_POSSIBLE_MAX_DDT_MB<<20);
+
+       numbuckets = ddt.max_ddt_size/(sizeof (dedup_entry_t));
+
+       /*
+        * numbuckets must be a power of 2.  Increase number to
+        * a power of 2 if necessary.
+        */
+       if (!ISP2(numbuckets))
+               numbuckets = 1 << high_order_bit(numbuckets);
+
+       ddt.dedup_hash_array = calloc(numbuckets, sizeof (dedup_entry_t *));
+       ddt.ddecache = umem_cache_create("dde", sizeof (dedup_entry_t), 0,
+           NULL, NULL, NULL, NULL, NULL, 0);
+       ddt.cur_ddt_size = numbuckets * sizeof (dedup_entry_t *);
+       ddt.numhashbits = high_order_bit(numbuckets) - 1;
+       ddt.ddt_full = B_FALSE;
+
+       /* Initialize the write-by-reference block. */
+       wbr_drr.drr_type = DRR_WRITE_BYREF;
+       wbr_drr.drr_payloadlen = 0;
+
+       outfd = dda->outputfd;
+       ofp = fdopen(dda->inputfd, "r");
+       while (ssread(drr, sizeof (dmu_replay_record_t), ofp) != 0) {
+
+               switch (drr->drr_type) {
+               case DRR_BEGIN:
+               {
+                       int     fflags;
+                       ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
+
+                       /* set the DEDUP feature flag for this stream */
+                       fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
+                       fflags |= (DMU_BACKUP_FEATURE_DEDUP |
+                           DMU_BACKUP_FEATURE_DEDUPPROPS);
+                       DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);
+
+                       if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
+                           &stream_cksum, outfd) == -1)
+                               goto out;
+                       if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
+                           DMU_COMPOUNDSTREAM && drr->drr_payloadlen != 0) {
+                               int sz = drr->drr_payloadlen;
+
+                               if (sz > 1<<20) {
+                                       free(buf);
+                                       buf = malloc(sz);
+                               }
+                               (void) ssread(buf, sz, ofp);
+                               if (ferror(stdin))
+                                       perror("fread");
+                               if (cksum_and_write(buf, sz, &stream_cksum,
+                                   outfd) == -1)
+                                       goto out;
+                       }
+                       break;
+               }
+
+               case DRR_END:
+               {
+                       /* use the recalculated checksum */
+                       ZIO_SET_CHECKSUM(&drre->drr_checksum,
+                           stream_cksum.zc_word[0], stream_cksum.zc_word[1],
+                           stream_cksum.zc_word[2], stream_cksum.zc_word[3]);
+                       if ((write(outfd, drr,
+                           sizeof (dmu_replay_record_t))) == -1)
+                               goto out;
+                       break;
+               }
+
+               case DRR_OBJECT:
+               {
+                       if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
+                           &stream_cksum, outfd) == -1)
+                               goto out;
+                       if (drro->drr_bonuslen > 0) {
+                               (void) ssread(buf,
+                                   P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8),
+                                   ofp);
+                               if (cksum_and_write(buf,
+                                   P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8),
+                                   &stream_cksum, outfd) == -1)
+                                       goto out;
+                       }
+                       break;
+               }
+
+               case DRR_SPILL:
+               {
+                       if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
+                           &stream_cksum, outfd) == -1)
+                               goto out;
+                       (void) ssread(buf, drrs->drr_length, ofp);
+                       if (cksum_and_write(buf, drrs->drr_length,
+                           &stream_cksum, outfd) == -1)
+                               goto out;
+                       break;
+               }
+
+               case DRR_FREEOBJECTS:
+               {
+                       if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
+                           &stream_cksum, outfd) == -1)
+                               goto out;
+                       break;
+               }
+
+               case DRR_WRITE:
+               {
+                       dataref_t       dataref;
+
+                       (void) ssread(buf, drrw->drr_length, ofp);
+
+                       /*
+                        * Use the existing checksum if it's dedup-capable,
+                        * else calculate a SHA256 checksum for it.
+                        */
+
+                       if (ZIO_CHECKSUM_EQUAL(drrw->drr_key.ddk_cksum,
+                           zero_cksum) ||
+                           !DRR_IS_DEDUP_CAPABLE(drrw->drr_checksumflags)) {
+                               SHA256_CTX      ctx;
+                               zio_cksum_t     tmpsha256;
+
+                               SHA256Init(&ctx);
+                               SHA256Update(&ctx, buf, drrw->drr_length);
+                               SHA256Final(&tmpsha256, &ctx);
+                               drrw->drr_key.ddk_cksum.zc_word[0] =
+                                   BE_64(tmpsha256.zc_word[0]);
+                               drrw->drr_key.ddk_cksum.zc_word[1] =
+                                   BE_64(tmpsha256.zc_word[1]);
+                               drrw->drr_key.ddk_cksum.zc_word[2] =
+                                   BE_64(tmpsha256.zc_word[2]);
+                               drrw->drr_key.ddk_cksum.zc_word[3] =
+                                   BE_64(tmpsha256.zc_word[3]);
+                               drrw->drr_checksumtype = ZIO_CHECKSUM_SHA256;
+                               drrw->drr_checksumflags = DRR_CHECKSUM_DEDUP;
+                       }
+
+                       dataref.ref_guid = drrw->drr_toguid;
+                       dataref.ref_object = drrw->drr_object;
+                       dataref.ref_offset = drrw->drr_offset;
+
+                       if (ddt_update(dda->dedup_hdl, &ddt,
+                           &drrw->drr_key.ddk_cksum, drrw->drr_key.ddk_prop,
+                           &dataref)) {
+                               /* block already present in stream */
+                               wbr_drrr->drr_object = drrw->drr_object;
+                               wbr_drrr->drr_offset = drrw->drr_offset;
+                               wbr_drrr->drr_length = drrw->drr_length;
+                               wbr_drrr->drr_toguid = drrw->drr_toguid;
+                               wbr_drrr->drr_refguid = dataref.ref_guid;
+                               wbr_drrr->drr_refobject =
+                                   dataref.ref_object;
+                               wbr_drrr->drr_refoffset =
+                                   dataref.ref_offset;
+
+                               wbr_drrr->drr_checksumtype =
+                                   drrw->drr_checksumtype;
+                               wbr_drrr->drr_checksumflags =
+                                   drrw->drr_checksumtype;
+                               wbr_drrr->drr_key.ddk_cksum =
+                                   drrw->drr_key.ddk_cksum;
+                               wbr_drrr->drr_key.ddk_prop =
+                                   drrw->drr_key.ddk_prop;
+
+                               if (cksum_and_write(&wbr_drr,
+                                   sizeof (dmu_replay_record_t), &stream_cksum,
+                                   outfd) == -1)
+                                       goto out;
+                       } else {
+                               /* block not previously seen */
+                               if (cksum_and_write(drr,
+                                   sizeof (dmu_replay_record_t), &stream_cksum,
+                                   outfd) == -1)
+                                       goto out;
+                               if (cksum_and_write(buf,
+                                   drrw->drr_length,
+                                   &stream_cksum, outfd) == -1)
+                                       goto out;
+                       }
+                       break;
+               }
+
+               case DRR_FREE:
+               {
+                       if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
+                           &stream_cksum, outfd) == -1)
+                               goto out;
+                       break;
+               }
+
+               default:
+                       (void) printf("INVALID record type 0x%x\n",
+                           drr->drr_type);
+                       /* should never happen, so assert */
+                       assert(B_FALSE);
+               }
+       }
+out:
+       umem_cache_destroy(ddt.ddecache);
+       free(ddt.dedup_hash_array);
+       free(buf);
+       (void) fclose(ofp);
+
+       return (NULL);
+}
 
 /*
  * Routines for dealing with the AVL tree of fs-nvlists
@@ -173,6 +544,7 @@ typedef struct send_data {
        nvlist_t *snapprops;
        const char *fromsnap;
        const char *tosnap;
+       boolean_t recursive;
 
        /*
         * The header nvlist is of the following format:
@@ -240,27 +612,50 @@ send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv)
                zfs_prop_t prop = zfs_name_to_prop(propname);
                nvlist_t *propnv;
 
-               assert(zfs_prop_user(propname) || prop != ZPROP_INVAL);
+               if (!zfs_prop_user(propname)) {
+                       /*
+                        * Realistically, this should never happen.  However,
+                        * we want the ability to add DSL properties without
+                        * needing to make incompatible version changes.  We
+                        * need to ignore unknown properties to allow older
+                        * software to still send datasets containing these
+                        * properties, with the unknown properties elided.
+                        */
+                       if (prop == ZPROP_INVAL)
+                               continue;
 
-               if (!zfs_prop_user(propname) && zfs_prop_readonly(prop))
-                       continue;
+                       if (zfs_prop_readonly(prop))
+                               continue;
+               }
 
                verify(nvpair_value_nvlist(elem, &propnv) == 0);
                if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION ||
                    prop == ZFS_PROP_REFQUOTA ||
                    prop == ZFS_PROP_REFRESERVATION) {
-                       /* these guys are modifyable, but have no source */
+                       char *source;
                        uint64_t value;
                        verify(nvlist_lookup_uint64(propnv,
                            ZPROP_VALUE, &value) == 0);
                        if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT)
                                continue;
+                       /*
+                        * May have no source before SPA_VERSION_RECVD_PROPS,
+                        * but is still modifiable.
+                        */
+                       if (nvlist_lookup_string(propnv,
+                           ZPROP_SOURCE, &source) == 0) {
+                               if ((strcmp(source, zhp->zfs_name) != 0) &&
+                                   (strcmp(source,
+                                   ZPROP_SOURCE_VAL_RECVD) != 0))
+                                       continue;
+                       }
                } else {
                        char *source;
                        if (nvlist_lookup_string(propnv,
                            ZPROP_SOURCE, &source) != 0)
                                continue;
-                       if (strcmp(source, zhp->zfs_name) != 0)
+                       if ((strcmp(source, zhp->zfs_name) != 0) &&
+                           (strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0))
                                continue;
                }
 
@@ -289,7 +684,7 @@ send_iterate_fs(zfs_handle_t *zhp, void *arg)
 {
        send_data_t *sd = arg;
        nvlist_t *nvfs, *nv;
-       int rv;
+       int rv = 0;
        uint64_t parent_fromsnap_guid_save = sd->parent_fromsnap_guid;
        uint64_t guid = zhp->zfs_dmustats.dds_guid;
        char guidstring[64];
@@ -331,7 +726,8 @@ send_iterate_fs(zfs_handle_t *zhp, void *arg)
        nvlist_free(nvfs);
 
        /* iterate over children */
-       rv = zfs_iter_filesystems(zhp, send_iterate_fs, sd);
+       if (sd->recursive)
+               rv = zfs_iter_filesystems(zhp, send_iterate_fs, sd);
 
        sd->parent_fromsnap_guid = parent_fromsnap_guid_save;
 
@@ -341,7 +737,7 @@ send_iterate_fs(zfs_handle_t *zhp, void *arg)
 
 static int
 gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap,
-    const char *tosnap, nvlist_t **nvlp, avl_tree_t **avlp)
+    const char *tosnap, boolean_t recursive, nvlist_t **nvlp, avl_tree_t **avlp)
 {
        zfs_handle_t *zhp;
        send_data_t sd = { 0 };
@@ -354,6 +750,7 @@ gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap,
        VERIFY(0 == nvlist_alloc(&sd.fss, NV_UNIQUE_NAME, 0));
        sd.fromsnap = fromsnap;
        sd.tosnap = tosnap;
+       sd.recursive = recursive;
 
        if ((error = send_iterate_fs(zhp, &sd)) != 0) {
                nvlist_free(sd.fss);
@@ -415,7 +812,7 @@ zfs_snapshot_compare(const void *larg, const void *rarg)
                return (0);
 }
 
-static int
+int
 zfs_iter_snapshots_sorted(zfs_handle_t *zhp, zfs_iter_f callback, void *data)
 {
        int ret = 0;
@@ -446,13 +843,16 @@ typedef struct send_dump_data {
        /* these are all just the short snapname (the part after the @) */
        const char *fromsnap;
        const char *tosnap;
-       char lastsnap[ZFS_MAXNAMELEN];
+       char prevsnap[ZFS_MAXNAMELEN];
        boolean_t seenfrom, seento, replicate, doall, fromorigin;
        boolean_t verbose;
        int outfd;
        boolean_t err;
        nvlist_t *fss;
        avl_tree_t *fsavl;
+       snapfilter_cb_t *filter_cb;
+       void *filter_cb_arg;
+       nvlist_t *debugnv;
 } send_dump_data_t;
 
 /*
@@ -461,10 +861,11 @@ typedef struct send_dump_data {
  */
 static int
 dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin,
-    int outfd)
+    int outfd, boolean_t enoent_ok, boolean_t *got_enoent, nvlist_t *debugnv)
 {
        zfs_cmd_t zc = { 0 };
        libzfs_handle_t *hdl = zhp->zfs_hdl;
+       nvlist_t *thisdbg;
 
        assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
        assert(fromsnap == NULL || fromsnap[0] == '\0' || !fromorigin);
@@ -475,11 +876,26 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin,
        zc.zc_cookie = outfd;
        zc.zc_obj = fromorigin;
 
+       *got_enoent = B_FALSE;
+
+       VERIFY(0 == nvlist_alloc(&thisdbg, NV_UNIQUE_NAME, 0));
+       if (fromsnap && fromsnap[0] != '\0') {
+               VERIFY(0 == nvlist_add_string(thisdbg,
+                   "fromsnap", fromsnap));
+       }
+
        if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SEND, &zc) != 0) {
                char errbuf[1024];
                (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
                    "warning: cannot send '%s'"), zhp->zfs_name);
 
+               VERIFY(0 == nvlist_add_uint64(thisdbg, "error", errno));
+               if (debugnv) {
+                       VERIFY(0 == nvlist_add_nvlist(debugnv,
+                           zhp->zfs_name, thisdbg));
+               }
+               nvlist_free(thisdbg);
+
                switch (errno) {
 
                case EXDEV:
@@ -488,6 +904,10 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin,
                        return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf));
 
                case ENOENT:
+                       if (enoent_ok) {
+                               *got_enoent = B_TRUE;
+                               return (0);
+                       }
                        if (zfs_dataset_exists(hdl, zc.zc_name,
                            ZFS_TYPE_SNAPSHOT)) {
                                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
@@ -515,6 +935,10 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin,
                }
        }
 
+       if (debugnv)
+               VERIFY(0 == nvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg));
+       nvlist_free(thisdbg);
+
        return (0);
 }
 
@@ -524,13 +948,17 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
        send_dump_data_t *sdd = arg;
        const char *thissnap;
        int err;
+       boolean_t got_enoent;
+       boolean_t isfromsnap, istosnap;
+       boolean_t exclude = B_FALSE;
 
        thissnap = strchr(zhp->zfs_name, '@') + 1;
+       isfromsnap = (sdd->fromsnap != NULL &&
+           strcmp(sdd->fromsnap, thissnap) == 0);
 
-       if (sdd->fromsnap && !sdd->seenfrom &&
-           strcmp(sdd->fromsnap, thissnap) == 0) {
+       if (!sdd->seenfrom && isfromsnap) {
                sdd->seenfrom = B_TRUE;
-               (void) strcpy(sdd->lastsnap, thissnap);
+               (void) strcpy(sdd->prevsnap, thissnap);
                zfs_close(zhp);
                return (0);
        }
@@ -540,20 +968,63 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
                return (0);
        }
 
+       istosnap = (strcmp(sdd->tosnap, thissnap) == 0);
+       if (istosnap)
+               sdd->seento = B_TRUE;
+
+       if (!sdd->doall && !isfromsnap && !istosnap) {
+               if (sdd->replicate) {
+                       char *snapname;
+                       nvlist_t *snapprops;
+                       /*
+                        * Filter out all intermediate snapshots except origin
+                        * snapshots needed to replicate clones.
+                        */
+                       nvlist_t *nvfs = fsavl_find(sdd->fsavl,
+                           zhp->zfs_dmustats.dds_guid, &snapname);
+
+                       VERIFY(0 == nvlist_lookup_nvlist(nvfs,
+                           "snapprops", &snapprops));
+                       VERIFY(0 == nvlist_lookup_nvlist(snapprops,
+                           thissnap, &snapprops));
+                       exclude = !nvlist_exists(snapprops, "is_clone_origin");
+               } else {
+                       exclude = B_TRUE;
+               }
+       }
+
+       /*
+        * If a filter function exists, call it to determine whether
+        * this snapshot will be sent.
+        */
+       if (exclude || (sdd->filter_cb != NULL &&
+           sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE)) {
+               /*
+                * This snapshot is filtered out.  Don't send it, and don't
+                * set prevsnap, so it will be as if this snapshot didn't
+                * exist, and the next accepted snapshot will be sent as
+                * an incremental from the last accepted one, or as the
+                * first (and full) snapshot in the case of a replication,
+                * non-incremental send.
+                */
+               zfs_close(zhp);
+               return (0);
+       }
+
        /* send it */
        if (sdd->verbose) {
                (void) fprintf(stderr, "sending from @%s to %s\n",
-                   sdd->lastsnap, zhp->zfs_name);
+                   sdd->prevsnap, zhp->zfs_name);
        }
 
-       err = dump_ioctl(zhp, sdd->lastsnap,
-           sdd->lastsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate),
-           sdd->outfd);
+       err = dump_ioctl(zhp, sdd->prevsnap,
+           sdd->prevsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate),
+           sdd->outfd, B_TRUE, &got_enoent, sdd->debugnv);
 
-       if (!sdd->seento && strcmp(sdd->tosnap, thissnap) == 0)
-               sdd->seento = B_TRUE;
-
-       (void) strcpy(sdd->lastsnap, thissnap);
+       if (got_enoent)
+               err = 0;
+       else
+               (void) strcpy(sdd->prevsnap, thissnap);
        zfs_close(zhp);
        return (err);
 }
@@ -592,51 +1063,32 @@ dump_filesystem(zfs_handle_t *zhp, void *arg)
                }
        }
 
-       if (sdd->doall) {
-               sdd->seenfrom = sdd->seento = sdd->lastsnap[0] = 0;
-               if (sdd->fromsnap == NULL || missingfrom)
-                       sdd->seenfrom = B_TRUE;
+       sdd->seenfrom = sdd->seento = sdd->prevsnap[0] = 0;
+       if (sdd->fromsnap == NULL || missingfrom)
+               sdd->seenfrom = B_TRUE;
 
-               rv = zfs_iter_snapshots_sorted(zhp, dump_snapshot, arg);
-               if (!sdd->seenfrom) {
+       rv = zfs_iter_snapshots_sorted(zhp, dump_snapshot, arg);
+       if (!sdd->seenfrom) {
+               (void) fprintf(stderr,
+                   "WARNING: could not send %s@%s:\n"
+                   "incremental source (%s@%s) does not exist\n",
+                   zhp->zfs_name, sdd->tosnap,
+                   zhp->zfs_name, sdd->fromsnap);
+               sdd->err = B_TRUE;
+       } else if (!sdd->seento) {
+               if (sdd->fromsnap) {
                        (void) fprintf(stderr,
                            "WARNING: could not send %s@%s:\n"
-                           "incremental source (%s@%s) does not exist\n",
+                           "incremental source (%s@%s) "
+                           "is not earlier than it\n",
                            zhp->zfs_name, sdd->tosnap,
                            zhp->zfs_name, sdd->fromsnap);
-                       sdd->err = B_TRUE;
-               } else if (!sdd->seento) {
-                       if (sdd->fromsnap) {
-                               (void) fprintf(stderr,
-                                   "WARNING: could not send %s@%s:\n"
-                                   "incremental source (%s@%s) "
-                                   "is not earlier than it\n",
-                                   zhp->zfs_name, sdd->tosnap,
-                                   zhp->zfs_name, sdd->fromsnap);
-                       } else {
-                               (void) fprintf(stderr, "WARNING: "
-                                   "could not send %s@%s: does not exist\n",
-                                   zhp->zfs_name, sdd->tosnap);
-                       }
-                       sdd->err = B_TRUE;
-               }
-       } else {
-               zfs_handle_t *snapzhp;
-               char snapname[ZFS_MAXNAMELEN];
-
-               (void) snprintf(snapname, sizeof (snapname), "%s@%s",
-                   zfs_get_name(zhp), sdd->tosnap);
-               snapzhp = zfs_open(zhp->zfs_hdl, snapname, ZFS_TYPE_SNAPSHOT);
-               if (snapzhp == NULL) {
-                       rv = -1;
                } else {
-                       rv = dump_ioctl(snapzhp,
-                           missingfrom ? NULL : sdd->fromsnap,
-                           sdd->fromorigin || missingfrom,
-                           sdd->outfd);
-                       sdd->seento = B_TRUE;
-                       zfs_close(snapzhp);
+                       (void) fprintf(stderr, "WARNING: "
+                           "could not send %s@%s: does not exist\n",
+                           zhp->zfs_name, sdd->tosnap);
                }
+               sdd->err = B_TRUE;
        }
 
        return (rv);
@@ -652,6 +1104,29 @@ dump_filesystems(zfs_handle_t *rzhp, void *arg)
        if (!sdd->replicate)
                return (dump_filesystem(rzhp, sdd));
 
+       /* Mark the clone origin snapshots. */
+       for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair;
+           fspair = nvlist_next_nvpair(sdd->fss, fspair)) {
+               nvlist_t *nvfs;
+               uint64_t origin_guid = 0;
+
+               VERIFY(0 == nvpair_value_nvlist(fspair, &nvfs));
+               (void) nvlist_lookup_uint64(nvfs, "origin", &origin_guid);
+               if (origin_guid != 0) {
+                       char *snapname;
+                       nvlist_t *origin_nv = fsavl_find(sdd->fsavl,
+                           origin_guid, &snapname);
+                       if (origin_nv != NULL) {
+                               nvlist_t *snapprops;
+                               VERIFY(0 == nvlist_lookup_nvlist(origin_nv,
+                                   "snapprops", &snapprops));
+                               VERIFY(0 == nvlist_lookup_nvlist(snapprops,
+                                   snapname, &snapprops));
+                               VERIFY(0 == nvlist_add_boolean(
+                                   snapprops, "is_clone_origin"));
+                       }
+               }
+       }
 again:
        needagain = progress = B_FALSE;
        for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair;
@@ -661,7 +1136,6 @@ again:
                zfs_handle_t *zhp;
                int err;
                uint64_t origin_guid = 0;
-               nvlist_t *origin_nv;
 
                VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0);
                if (nvlist_lookup_boolean(fslist, "sent") == 0)
@@ -670,15 +1144,19 @@ again:
                VERIFY(nvlist_lookup_string(fslist, "name", &fsname) == 0);
                (void) nvlist_lookup_uint64(fslist, "origin", &origin_guid);
 
-               origin_nv = fsavl_find(sdd->fsavl, origin_guid, NULL);
-               if (origin_nv &&
-                   nvlist_lookup_boolean(origin_nv, "sent") == ENOENT) {
-                       /*
-                        * origin has not been sent yet;
-                        * skip this clone.
-                        */
-                       needagain = B_TRUE;
-                       continue;
+               if (origin_guid != 0) {
+                       nvlist_t *origin_nv = fsavl_find(sdd->fsavl,
+                           origin_guid, NULL);
+                       if (origin_nv != NULL &&
+                           nvlist_lookup_boolean(origin_nv,
+                           "sent") == ENOENT) {
+                               /*
+                                * origin has not been sent yet;
+                                * skip this clone.
+                                */
+                               needagain = B_TRUE;
+                               continue;
+                       }
                }
 
                zhp = zfs_open(rzhp->zfs_hdl, fsname, ZFS_TYPE_DATASET);
@@ -709,21 +1187,37 @@ again:
  *      is TRUE.
  *
  * The send stream is recursive (i.e. dumps a hierarchy of snapshots) and
- * uses a special header (with a version field of DMU_BACKUP_HEADER_VERSION)
+ * uses a special header (with a hdrtype field of DMU_COMPOUNDSTREAM)
  * if "replicate" is set.  If "doall" is set, dump all the intermediate
- * snapshots. The DMU_BACKUP_HEADER_VERSION header is used in the "doall"
- * case too.
+ * snapshots. The DMU_COMPOUNDSTREAM header is used in the "doall"
+ * case too. If "props" is set, send properties.
  */
 int
 zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
-    boolean_t replicate, boolean_t doall, boolean_t fromorigin,
-    boolean_t verbose, int outfd)
+    sendflags_t flags, int outfd, snapfilter_cb_t filter_func,
+    void *cb_arg, nvlist_t **debugnvp)
 {
        char errbuf[1024];
        send_dump_data_t sdd = { 0 };
        int err;
        nvlist_t *fss = NULL;
        avl_tree_t *fsavl = NULL;
+       char holdtag[128];
+       static uint64_t holdseq;
+       int spa_version;
+       boolean_t holdsnaps = B_FALSE;
+       pthread_t tid;
+       int pipefd[2];
+       dedup_arg_t dda = { 0 };
+       int featureflags = 0;
+
+       if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM) {
+               uint64_t version;
+               version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
+               if (version >= ZPL_VERSION_SA) {
+                       featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
+               }
+       }
 
        (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
            "cannot send '%s'"), zhp->zfs_name);
@@ -734,15 +1228,48 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
                return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf));
        }
 
-       if (replicate || doall) {
+       if (zfs_spa_version(zhp, &spa_version) == 0 &&
+           spa_version >= SPA_VERSION_USERREFS)
+               holdsnaps = B_TRUE;
+
+       if (flags.dedup) {
+               featureflags |= (DMU_BACKUP_FEATURE_DEDUP |
+                   DMU_BACKUP_FEATURE_DEDUPPROPS);
+               if (err = pipe(pipefd)) {
+                       zfs_error_aux(zhp->zfs_hdl, strerror(errno));
+                       return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED,
+                           errbuf));
+               }
+               dda.outputfd = outfd;
+               dda.inputfd = pipefd[1];
+               dda.dedup_hdl = zhp->zfs_hdl;
+               if (err = pthread_create(&tid, NULL, cksummer, &dda)) {
+                       (void) close(pipefd[0]);
+                       (void) close(pipefd[1]);
+                       zfs_error_aux(zhp->zfs_hdl, strerror(errno));
+                       return (zfs_error(zhp->zfs_hdl,
+                           EZFS_THREADCREATEFAILED, errbuf));
+               }
+       }
+
+       if (flags.replicate || flags.doall || flags.props) {
                dmu_replay_record_t drr = { 0 };
                char *packbuf = NULL;
                size_t buflen = 0;
                zio_cksum_t zc = { 0 };
 
-               assert(fromsnap || doall);
+               if (holdsnaps) {
+                       (void) snprintf(holdtag, sizeof (holdtag),
+                           ".send-%d-%llu", getpid(), (u_longlong_t)holdseq);
+                       ++holdseq;
+                       err = zfs_hold_range(zhp, fromsnap, tosnap,
+                           holdtag, flags.replicate, B_TRUE, filter_func,
+                           cb_arg);
+                       if (err)
+                               goto err_out;
+               }
 
-               if (replicate) {
+               if (flags.replicate || flags.props) {
                        nvlist_t *hdrnv;
 
                        VERIFY(0 == nvlist_alloc(&hdrnv, NV_UNIQUE_NAME, 0));
@@ -751,45 +1278,65 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
                                    "fromsnap", fromsnap));
                        }
                        VERIFY(0 == nvlist_add_string(hdrnv, "tosnap", tosnap));
+                       if (!flags.replicate) {
+                               VERIFY(0 == nvlist_add_boolean(hdrnv,
+                                   "not_recursive"));
+                       }
 
                        err = gather_nvlist(zhp->zfs_hdl, zhp->zfs_name,
-                           fromsnap, tosnap, &fss, &fsavl);
-                       if (err)
-                               return (err);
+                           fromsnap, tosnap, flags.replicate, &fss, &fsavl);
+                       if (err) {
+                               if (holdsnaps) {
+                                       (void) zfs_release_range(zhp, fromsnap,
+                                           tosnap, holdtag, flags.replicate);
+                               }
+                               goto err_out;
+                       }
                        VERIFY(0 == nvlist_add_nvlist(hdrnv, "fss", fss));
                        err = nvlist_pack(hdrnv, &packbuf, &buflen,
                            NV_ENCODE_XDR, 0);
-                       nvlist_free(hdrnv);
+                       if (debugnvp)
+                               *debugnvp = hdrnv;
+                       else
+                               nvlist_free(hdrnv);
                        if (err) {
                                fsavl_destroy(fsavl);
                                nvlist_free(fss);
-                               return (zfs_standard_error(zhp->zfs_hdl,
-                                   err, errbuf));
+                               if (holdsnaps) {
+                                       (void) zfs_release_range(zhp, fromsnap,
+                                           tosnap, holdtag, flags.replicate);
+                               }
+                               goto stderr_out;
                        }
                }
 
                /* write first begin record */
                drr.drr_type = DRR_BEGIN;
                drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
-               drr.drr_u.drr_begin.drr_version = DMU_BACKUP_HEADER_VERSION;
+               DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin.drr_versioninfo,
+                   DMU_COMPOUNDSTREAM);
+               DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin.drr_versioninfo,
+                   featureflags);
                (void) snprintf(drr.drr_u.drr_begin.drr_toname,
                    sizeof (drr.drr_u.drr_begin.drr_toname),
                    "%s@%s", zhp->zfs_name, tosnap);
                drr.drr_payloadlen = buflen;
-               fletcher_4_incremental_native(&drr, sizeof (drr), &zc);
-               err = write(outfd, &drr, sizeof (drr));
+               err = cksum_and_write(&drr, sizeof (drr), &zc, outfd);
 
                /* write header nvlist */
-               if (err != -1) {
-                       fletcher_4_incremental_native(packbuf, buflen, &zc);
-                       err = write(outfd, packbuf, buflen);
+               if (err != -1 && packbuf != NULL) {
+                       err = cksum_and_write(packbuf, buflen, &zc, outfd);
                }
                free(packbuf);
                if (err == -1) {
                        fsavl_destroy(fsavl);
                        nvlist_free(fss);
-                       return (zfs_standard_error(zhp->zfs_hdl,
-                           errno, errbuf));
+                       if (holdsnaps) {
+                               (void) zfs_release_range(zhp, fromsnap, tosnap,
+                                   holdtag, flags.replicate);
+                       }
+                       err = errno;
+                       goto stderr_out;
                }
 
                /* write end record */
@@ -801,8 +1348,12 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
                        if (err == -1) {
                                fsavl_destroy(fsavl);
                                nvlist_free(fss);
-                               return (zfs_standard_error(zhp->zfs_hdl,
-                                   errno, errbuf));
+                               err = errno;
+                               if (holdsnaps) {
+                                       (void) zfs_release_range(zhp, fromsnap,
+                                           tosnap, holdtag, flags.replicate);
+                               }
+                               goto stderr_out;
                        }
                }
        }
@@ -810,18 +1361,30 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
        /* dump each stream */
        sdd.fromsnap = fromsnap;
        sdd.tosnap = tosnap;
-       sdd.outfd = outfd;
-       sdd.replicate = replicate;
-       sdd.doall = doall;
-       sdd.fromorigin = fromorigin;
+       if (flags.dedup)
+               sdd.outfd = pipefd[0];
+       else
+               sdd.outfd = outfd;
+       sdd.replicate = flags.replicate;
+       sdd.doall = flags.doall;
+       sdd.fromorigin = flags.fromorigin;
        sdd.fss = fss;
        sdd.fsavl = fsavl;
-       sdd.verbose = verbose;
+       sdd.verbose = flags.verbose;
+       sdd.filter_cb = filter_func;
+       sdd.filter_cb_arg = cb_arg;
+       if (debugnvp)
+               sdd.debugnv = *debugnvp;
        err = dump_filesystems(zhp, &sdd);
        fsavl_destroy(fsavl);
        nvlist_free(fss);
 
-       if (replicate || doall) {
+       if (flags.dedup) {
+               (void) close(pipefd[0]);
+               (void) pthread_join(tid, NULL);
+       }
+
+       if (flags.replicate || flags.doall || flags.props) {
                /*
                 * write final end record.  NB: want to do this even if
                 * there was some error, because it might not be totally
@@ -829,6 +1392,10 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
                 */
                dmu_replay_record_t drr = { 0 };
                drr.drr_type = DRR_END;
+               if (holdsnaps) {
+                       (void) zfs_release_range(zhp, fromsnap, tosnap,
+                           holdtag, flags.replicate);
+               }
                if (write(outfd, &drr, sizeof (drr)) == -1) {
                        return (zfs_standard_error(zhp->zfs_hdl,
                            errno, errbuf));
@@ -836,6 +1403,16 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
        }
 
        return (err || sdd.err);
+
+stderr_out:
+       err = zfs_standard_error(zhp->zfs_hdl, err, errbuf);
+err_out:
+       if (flags.dedup) {
+               (void) pthread_cancel(tid);
+               (void) pthread_join(tid, NULL);
+               (void) close(pipefd[0]);
+       }
+       return (err);
 }
 
 /*
@@ -1017,12 +1594,13 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen,
        changelist_free(clp);
 
        /*
-        * Deferred destroy should always succeed. Since we can't tell
-        * if it destroyed the dataset or just marked it for deferred
-        * destroy, always do the rename just in case.
+        * Deferred destroy might destroy the snapshot or only mark it to be
+        * destroyed later, and it returns success in either case.
         */
-       if (err != 0 || defer)
+       if (err != 0 || (defer && zfs_dataset_exists(hdl, name,
+           ZFS_TYPE_SNAPSHOT))) {
                err = recv_rename(hdl, name, NULL, baselen, newname, flags);
+       }
 
        return (err);
 }
@@ -1040,6 +1618,7 @@ guid_to_name_cb(zfs_handle_t *zhp, void *arg)
 
        if (zhp->zfs_dmustats.dds_guid == gtnd->guid) {
                (void) strcpy(gtnd->name, zhp->zfs_name);
+               zfs_close(zhp);
                return (EEXIST);
        }
        err = zfs_iter_children(zhp, guid_to_name_cb, gtnd);
@@ -1130,18 +1709,22 @@ created_before(libzfs_handle_t *hdl, avl_tree_t *avl,
 
 static int
 recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs,
-    recvflags_t flags, nvlist_t *stream_nv, avl_tree_t *stream_avl)
+    recvflags_t flags, nvlist_t *stream_nv, avl_tree_t *stream_avl,
+    nvlist_t *renamed)
 {
        nvlist_t *local_nv;
        avl_tree_t *local_avl;
        nvpair_t *fselem, *nextfselem;
-       char *tosnap, *fromsnap;
+       char *fromsnap;
        char newname[ZFS_MAXNAMELEN];
        int error;
-       boolean_t needagain, progress;
+       boolean_t needagain, progress, recursive;
+       char *s1, *s2;
 
        VERIFY(0 == nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap));
-       VERIFY(0 == nvlist_lookup_string(stream_nv, "tosnap", &tosnap));
+
+       recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") ==
+           ENOENT);
 
        if (flags.dryrun)
                return (0);
@@ -1150,7 +1733,7 @@ again:
        needagain = progress = B_FALSE;
 
        if ((error = gather_nvlist(hdl, tofs, fromsnap, NULL,
-           &local_nv, &local_avl)) != 0)
+           recursive, &local_nv, &local_avl)) != 0)
                return (error);
 
        /*
@@ -1273,7 +1856,7 @@ again:
                            stream_snapname, &props)) {
                                zfs_cmd_t zc = { 0 };
 
-                               zc.zc_cookie = B_TRUE; /* clear current props */
+                               zc.zc_cookie = B_TRUE; /* received */
                                (void) snprintf(zc.zc_name, sizeof (zc.zc_name),
                                    "%s@%s", fsname, nvpair_name(snapelem));
                                if (zcmd_write_src_nvlist(hdl, &zc,
@@ -1321,10 +1904,13 @@ again:
                        continue;
                }
 
-               if (fromguid == 0 && flags.verbose) {
-                       (void) printf("local fs %s does not have fromsnap "
-                           "(%s in stream); must have been deleted locally; "
-                           "ignoring\n", fsname, fromsnap);
+               if (fromguid == 0) {
+                       if (flags.verbose) {
+                               (void) printf("local fs %s does not have "
+                                   "fromsnap (%s in stream); must have "
+                                   "been deleted locally; ignoring\n",
+                                   fsname, fromsnap);
+                       }
                        continue;
                }
 
@@ -1333,11 +1919,19 @@ again:
                VERIFY(0 == nvlist_lookup_uint64(stream_nvfs,
                    "parentfromsnap", &stream_parent_fromsnap_guid));
 
-               /* check for rename */
+               s1 = strrchr(fsname, '/');
+               s2 = strrchr(stream_fsname, '/');
+
+               /*
+                * Check for rename. If the exact receive path is specified, it
+                * does not count as a rename, but we still need to check the
+                * datasets beneath it.
+                */
                if ((stream_parent_fromsnap_guid != 0 &&
+                   parent_fromsnap_guid != 0 &&
                    stream_parent_fromsnap_guid != parent_fromsnap_guid) ||
-                   strcmp(strrchr(fsname, '/'),
-                   strrchr(stream_fsname, '/')) != 0) {
+                   ((flags.isprefix || strcmp(tofs, fsname) != 0) &&
+                   (s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) {
                        nvlist_t *parent;
                        char tryname[ZFS_MAXNAMELEN];
 
@@ -1365,8 +1959,16 @@ again:
                                }
                        }
 
+                       newname[0] = '\0';
+
                        error = recv_rename(hdl, fsname, tryname,
                            strlen(tofs)+1, newname, flags);
+
+                       if (renamed != NULL && newname[0] != '\0') {
+                               VERIFY(0 == nvlist_add_boolean(renamed,
+                                   newname));
+                       }
+
                        if (error)
                                needagain = B_TRUE;
                        else
@@ -1395,37 +1997,28 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname,
        nvlist_t *stream_nv = NULL;
        avl_tree_t *stream_avl = NULL;
        char *fromsnap = NULL;
+       char *cp;
        char tofs[ZFS_MAXNAMELEN];
+       char sendfs[ZFS_MAXNAMELEN];
        char errbuf[1024];
        dmu_replay_record_t drre;
        int error;
        boolean_t anyerr = B_FALSE;
        boolean_t softerr = B_FALSE;
+       boolean_t recursive;
 
        (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
            "cannot receive"));
 
-       if (strchr(destname, '@')) {
-               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-                   "can not specify snapshot name for multi-snapshot stream"));
-               return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
-       }
-
        assert(drr->drr_type == DRR_BEGIN);
        assert(drr->drr_u.drr_begin.drr_magic == DMU_BACKUP_MAGIC);
-       assert(drr->drr_u.drr_begin.drr_version == DMU_BACKUP_HEADER_VERSION);
+       assert(DMU_GET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo) ==
+           DMU_COMPOUNDSTREAM);
 
        /*
         * Read in the nvlist from the stream.
         */
        if (drr->drr_payloadlen != 0) {
-               if (!flags.isprefix) {
-                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-                           "must use -d to receive replication "
-                           "(send -R) stream"));
-                       return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
-               }
-
                error = recv_read_nvlist(hdl, fd, drr->drr_payloadlen,
                    &stream_nv, flags.byteswap, zc);
                if (error) {
@@ -1434,6 +2027,16 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname,
                }
        }
 
+       recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") ==
+           ENOENT);
+
+       if (recursive && strchr(destname, '@')) {
+               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                   "cannot specify snapshot name for multi-snapshot stream"));
+               error = zfs_error(hdl, EZFS_BADSTREAM, errbuf);
+               goto out;
+       }
+
        /*
         * Read in the end record and verify checksum.
         */
@@ -1477,21 +2080,73 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname,
                }
 
                if (fromsnap != NULL) {
+                       nvlist_t *renamed = NULL;
+                       nvpair_t *pair = NULL;
+
                        (void) strlcpy(tofs, destname, ZFS_MAXNAMELEN);
                        if (flags.isprefix) {
-                               int i = strcspn(drr->drr_u.drr_begin.drr_toname,
-                                   "/@");
+                               struct drr_begin *drrb = &drr->drr_u.drr_begin;
+                               int i;
+
+                               if (flags.istail) {
+                                       cp = strrchr(drrb->drr_toname, '/');
+                                       if (cp == NULL) {
+                                               (void) strlcat(tofs, "/",
+                                                   ZFS_MAXNAMELEN);
+                                               i = 0;
+                                       } else {
+                                               i = (cp - drrb->drr_toname);
+                                       }
+                               } else {
+                                       i = strcspn(drrb->drr_toname, "/@");
+                               }
                                /* zfs_receive_one() will create_parents() */
-                               (void) strlcat(tofs,
-                                   &drr->drr_u.drr_begin.drr_toname[i],
+                               (void) strlcat(tofs, &drrb->drr_toname[i],
                                    ZFS_MAXNAMELEN);
                                *strchr(tofs, '@') = '\0';
                        }
-                       softerr = recv_incremental_replication(hdl, tofs,
-                           flags, stream_nv, stream_avl);
+
+                       if (recursive && !flags.dryrun && !flags.nomount) {
+                               VERIFY(0 == nvlist_alloc(&renamed,
+                                   NV_UNIQUE_NAME, 0));
+                       }
+
+                       softerr = recv_incremental_replication(hdl, tofs, flags,
+                           stream_nv, stream_avl, renamed);
+
+                       /* Unmount renamed filesystems before receiving. */
+                       while ((pair = nvlist_next_nvpair(renamed,
+                           pair)) != NULL) {
+                               zfs_handle_t *zhp;
+                               prop_changelist_t *clp = NULL;
+
+                               zhp = zfs_open(hdl, nvpair_name(pair),
+                                   ZFS_TYPE_FILESYSTEM);
+                               if (zhp != NULL) {
+                                       clp = changelist_gather(zhp,
+                                           ZFS_PROP_MOUNTPOINT, 0, 0);
+                                       zfs_close(zhp);
+                                       if (clp != NULL) {
+                                               softerr |=
+                                                   changelist_prefix(clp);
+                                               changelist_free(clp);
+                                       }
+                               }
+                       }
+
+                       nvlist_free(renamed);
                }
        }
 
+       /*
+        * Get the fs specified by the first path in the stream (the top level
+        * specified by 'zfs send') and pass it to each invocation of
+        * zfs_receive_one().
+        */
+       (void) strlcpy(sendfs, drr->drr_u.drr_begin.drr_toname,
+           ZFS_MAXNAMELEN);
+       if ((cp = strchr(sendfs, '@')) != NULL)
+               *cp = '\0';
 
        /* Finally, receive each contained stream */
        do {
@@ -1503,7 +2158,7 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname,
                 * recv_skip() and return 0).
                 */
                error = zfs_receive_impl(hdl, destname, flags, fd,
-                   stream_avl, top_zfs);
+                   sendfs, stream_nv, stream_avl, top_zfs);
                if (error == ENODATA) {
                        error = 0;
                        break;
@@ -1517,7 +2172,7 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname,
                 * renames again.
                 */
                softerr = recv_incremental_replication(hdl, tofs, flags,
-                   stream_nv, stream_avl);
+                   stream_nv, stream_avl, NULL);
        }
 
 out:
@@ -1531,11 +2186,28 @@ out:
        return (error);
 }
 
+static void
+trunc_prop_errs(int truncated)
+{
+       ASSERT(truncated != 0);
+
+       if (truncated == 1)
+               (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
+                   "1 more property could not be set\n"));
+       else
+               (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
+                   "%d more properties could not be set\n"), truncated);
+}
+
 static int
 recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
 {
        dmu_replay_record_t *drr;
        void *buf = malloc(1<<20);
+       char errbuf[1024];
+
+       (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+           "cannot receive:"));
 
        /* XXX would be great to use lseek if possible... */
        drr = buf;
@@ -1548,7 +2220,11 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
                switch (drr->drr_type) {
                case DRR_BEGIN:
                        /* NB: not to be used on v2 stream packages */
-                       assert(drr->drr_payloadlen == 0);
+                       if (drr->drr_payloadlen != 0) {
+                               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                                   "invalid substream header"));
+                               return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
+                       }
                        break;
 
                case DRR_END:
@@ -1574,13 +2250,23 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
                        (void) recv_read(hdl, fd, buf,
                            drr->drr_u.drr_write.drr_length, B_FALSE, NULL);
                        break;
-
+               case DRR_SPILL:
+                       if (byteswap) {
+                               drr->drr_u.drr_write.drr_length =
+                                   BSWAP_64(drr->drr_u.drr_spill.drr_length);
+                       }
+                       (void) recv_read(hdl, fd, buf,
+                           drr->drr_u.drr_spill.drr_length, B_FALSE, NULL);
+                       break;
+               case DRR_WRITE_BYREF:
                case DRR_FREEOBJECTS:
                case DRR_FREE:
                        break;
 
                default:
-                       assert(!"invalid record type");
+                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                           "invalid record type"));
+                       return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
                }
        }
 
@@ -1594,27 +2280,33 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
 static int
 zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
     recvflags_t flags, dmu_replay_record_t *drr,
-    dmu_replay_record_t *drr_noswap, avl_tree_t *stream_avl,
-    char **top_zfs)
+    dmu_replay_record_t *drr_noswap, const char *sendfs,
+    nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs)
 {
        zfs_cmd_t zc = { 0 };
        time_t begin_time;
-       int ioctl_err, ioctl_errno, err, choplen;
+       int ioctl_err, ioctl_errno, err;
        char *cp;
        struct drr_begin *drrb = &drr->drr_u.drr_begin;
        char errbuf[1024];
-       char chopprefix[ZFS_MAXNAMELEN];
+       char prop_errbuf[1024];
+       const char *chopprefix;
        boolean_t newfs = B_FALSE;
        boolean_t stream_wantsnewfs;
        uint64_t parent_snapguid = 0;
        prop_changelist_t *clp = NULL;
        nvlist_t *snapprops_nvlist = NULL;
+       zprop_errflags_t prop_errflags;
+       boolean_t recursive;
 
        begin_time = time(NULL);
 
        (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
            "cannot receive"));
 
+       recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") ==
+           ENOENT);
+
        if (stream_avl != NULL) {
                char *snapname;
                nvlist_t *fs = fsavl_find(stream_avl, drrb->drr_toguid,
@@ -1645,6 +2337,8 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                        return (-1);
        }
 
+       cp = NULL;
+
        /*
         * Determine how much of the snapshot name stored in the stream
         * we are going to tack on to the name they specified on the
@@ -1653,38 +2347,77 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
         * If they specified a snapshot, chop the entire name stored in
         * the stream.
         */
-       (void) strcpy(chopprefix, drrb->drr_toname);
-       if (flags.isprefix) {
+       if (flags.istail) {
+               /*
+                * A filesystem was specified with -e. We want to tack on only
+                * the tail of the sent snapshot path.
+                */
+               if (strchr(tosnap, '@')) {
+                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
+                           "argument - snapshot not allowed with -e"));
+                       return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
+               }
+
+               chopprefix = strrchr(sendfs, '/');
+
+               if (chopprefix == NULL) {
+                       /*
+                        * The tail is the poolname, so we need to
+                        * prepend a path separator.
+                        */
+                       int len = strlen(drrb->drr_toname);
+                       cp = malloc(len + 2);
+                       cp[0] = '/';
+                       (void) strcpy(&cp[1], drrb->drr_toname);
+                       chopprefix = cp;
+               } else {
+                       chopprefix = drrb->drr_toname + (chopprefix - sendfs);
+               }
+       } else if (flags.isprefix) {
                /*
-                * They specified a fs with -d, we want to tack on
-                * everything but the pool name stored in the stream
+                * A filesystem was specified with -d. We want to tack on
+                * everything but the first element of the sent snapshot path
+                * (all but the pool name).
                 */
                if (strchr(tosnap, '@')) {
                        zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
                            "argument - snapshot not allowed with -d"));
                        return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
                }
-               cp = strchr(chopprefix, '/');
-               if (cp == NULL)
-                       cp = strchr(chopprefix, '@');
-               *cp = '\0';
+
+               chopprefix = strchr(drrb->drr_toname, '/');
+               if (chopprefix == NULL)
+                       chopprefix = strchr(drrb->drr_toname, '@');
        } else if (strchr(tosnap, '@') == NULL) {
                /*
-                * If they specified a filesystem without -d, we want to
-                * tack on everything after the fs specified in the
-                * first name from the stream.
+                * If a filesystem was specified without -d or -e, we want to
+                * tack on everything after the fs specified by 'zfs send'.
                 */
-               cp = strchr(chopprefix, '@');
-               *cp = '\0';
+               chopprefix = drrb->drr_toname + strlen(sendfs);
+       } else {
+               /* A snapshot was specified as an exact path (no -d or -e). */
+               if (recursive) {
+                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                           "cannot specify snapshot name for multi-snapshot "
+                           "stream"));
+                       return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
+               }
+               chopprefix = drrb->drr_toname + strlen(drrb->drr_toname);
        }
-       choplen = strlen(chopprefix);
+
+       ASSERT(strstr(drrb->drr_toname, sendfs) == drrb->drr_toname);
+       ASSERT(chopprefix > drrb->drr_toname);
+       ASSERT(chopprefix <= drrb->drr_toname + strlen(drrb->drr_toname));
+       ASSERT(chopprefix[0] == '/' || chopprefix[0] == '@' ||
+           chopprefix[0] == '\0');
 
        /*
         * Determine name of destination snapshot, store in zc_value.
         */
+       (void) strcpy(zc.zc_top_ds, tosnap);
        (void) strcpy(zc.zc_value, tosnap);
-       (void) strncat(zc.zc_value, drrb->drr_toname+choplen,
-           sizeof (zc.zc_value));
+       (void) strncat(zc.zc_value, chopprefix, sizeof (zc.zc_value));
+       free(cp);
        if (!zfs_name_valid(zc.zc_value, ZFS_TYPE_SNAPSHOT)) {
                zcmd_free_nvlists(&zc);
                return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
@@ -1742,7 +2475,14 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                (void) strcpy(zc.zc_name, zc.zc_value);
                *strchr(zc.zc_name, '@') = '\0';
 
-               if (!zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) {
+               /*
+                * If the exact receive path was specified and this is the
+                * topmost path in the stream, then if the fs does not exist we
+                * should look no further.
+                */
+               if ((flags.isprefix || (*(chopprefix = drrb->drr_toname +
+                   strlen(sendfs)) != '\0' && *chopprefix != '@')) &&
+                   !zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) {
                        char snap[ZFS_MAXNAMELEN];
                        (void) strcpy(snap, strchr(zc.zc_value, '@'));
                        if (guid_to_name(hdl, tosnap, drrb->drr_fromguid,
@@ -1758,6 +2498,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
 
        if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) {
                zfs_handle_t *zhp;
+
                /*
                 * Destination fs exists.  Therefore this should either
                 * be an incremental, or the stream specifies a new fs
@@ -1765,7 +2506,6 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                 * away (and have therefore specified -F and removed any
                 * snapshots).
                 */
-
                if (stream_wantsnewfs) {
                        if (!flags.force) {
                                zcmd_free_nvlists(&zc);
@@ -1819,12 +2559,6 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                                return (-1);
                        }
                }
-               if (!flags.dryrun && zhp->zfs_type == ZFS_TYPE_VOLUME &&
-                   zvol_remove_link(hdl, zhp->zfs_name) != 0) {
-                       zfs_close(zhp);
-                       zcmd_free_nvlists(&zc);
-                       return (-1);
-               }
                zfs_close(zhp);
        } else {
                /*
@@ -1848,7 +2582,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                 */
                *cp = '\0';
 
-               if (flags.isprefix && !flags.dryrun &&
+               if (flags.isprefix && !flags.istail && !flags.dryrun &&
                    create_parents(hdl, zc.zc_value, strlen(tosnap)) != 0) {
                        zcmd_free_nvlists(&zc);
                        return (zfs_error(hdl, EZFS_BADRESTORE, errbuf));
@@ -1873,21 +2607,59 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                return (recv_skip(hdl, infd, flags.byteswap));
        }
 
+       zc.zc_nvlist_dst = (uint64_t)(uintptr_t)prop_errbuf;
+       zc.zc_nvlist_dst_size = sizeof (prop_errbuf);
+
        err = ioctl_err = zfs_ioctl(hdl, ZFS_IOC_RECV, &zc);
        ioctl_errno = errno;
+       prop_errflags = (zprop_errflags_t)zc.zc_obj;
+
+       if (err == 0) {
+               nvlist_t *prop_errors;
+               VERIFY(0 == nvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst,
+                   zc.zc_nvlist_dst_size, &prop_errors, 0));
+
+               nvpair_t *prop_err = NULL;
+
+               while ((prop_err = nvlist_next_nvpair(prop_errors,
+                   prop_err)) != NULL) {
+                       char tbuf[1024];
+                       zfs_prop_t prop;
+                       int intval;
+
+                       prop = zfs_name_to_prop(nvpair_name(prop_err));
+                       (void) nvpair_value_int32(prop_err, &intval);
+                       if (strcmp(nvpair_name(prop_err),
+                           ZPROP_N_MORE_ERRORS) == 0) {
+                               trunc_prop_errs(intval);
+                               break;
+                       } else {
+                               (void) snprintf(tbuf, sizeof (tbuf),
+                                   dgettext(TEXT_DOMAIN,
+                                   "cannot receive %s property on %s"),
+                                   nvpair_name(prop_err), zc.zc_name);
+                               zfs_setprop_error(hdl, prop, intval, tbuf);
+                       }
+               }
+               nvlist_free(prop_errors);
+       }
+
+       zc.zc_nvlist_dst = 0;
+       zc.zc_nvlist_dst_size = 0;
        zcmd_free_nvlists(&zc);
 
        if (err == 0 && snapprops_nvlist) {
                zfs_cmd_t zc2 = { 0 };
 
                (void) strcpy(zc2.zc_name, zc.zc_value);
+               zc2.zc_cookie = B_TRUE; /* received */
                if (zcmd_write_src_nvlist(hdl, &zc2, snapprops_nvlist) == 0) {
                        (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc2);
                        zcmd_free_nvlists(&zc2);
                }
        }
 
-       if (err && (ioctl_errno == ENOENT || ioctl_errno == ENODEV)) {
+       if (err && (ioctl_errno == ENOENT || ioctl_errno == EEXIST)) {
                /*
                 * It may be that this snapshot already exists,
                 * in which case we want to consume & ignore it
@@ -1895,7 +2667,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                 */
                avl_tree_t *local_avl;
                nvlist_t *local_nv, *fs;
-               char *cp = strchr(zc.zc_value, '@');
+               cp = strchr(zc.zc_value, '@');
 
                /*
                 * XXX Do this faster by just iterating over snaps in
@@ -1903,7 +2675,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                 * get a strange "does not exist" error message.
                 */
                *cp = '\0';
-               if (gather_nvlist(hdl, zc.zc_value, NULL, NULL,
+               if (gather_nvlist(hdl, zc.zc_value, NULL, NULL, B_FALSE,
                    &local_nv, &local_avl) == 0) {
                        *cp = '@';
                        fs = fsavl_find(local_avl, drrb->drr_toguid, NULL);
@@ -1915,14 +2687,13 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                                        (void) printf("snap %s already exists; "
                                            "ignoring\n", zc.zc_value);
                                }
-                               ioctl_err = recv_skip(hdl, infd,
+                               err = ioctl_err = recv_skip(hdl, infd,
                                    flags.byteswap);
                        }
                }
                *cp = '@';
        }
 
-
        if (ioctl_err != 0) {
                switch (ioctl_errno) {
                case ENODEV:
@@ -1961,18 +2732,25 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                            "invalid stream (checksum mismatch)"));
                        (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
                        break;
+               case ENOTSUP:
+                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                           "pool must be upgraded to receive this stream."));
+                       (void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
+                       break;
+               case EDQUOT:
+                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                           "destination %s space quota exceeded"), zc.zc_name);
+                       (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf);
+                       break;
                default:
                        (void) zfs_standard_error(hdl, ioctl_errno, errbuf);
                }
        }
 
        /*
-        * Mount or recreate the /dev links for the target filesystem
-        * (if created, or if we tore them down to do an incremental
-        * restore), and the /dev links for the new snapshot (if
-        * created). Also mount any children of the target filesystem
-        * if we did a replication receive (indicated by stream_avl
-        * being non-NULL).
+        * Mount the target filesystem (if created).  Also mount any
+        * children of the target filesystem if we did a replication
+        * receive (indicated by stream_avl being non-NULL).
         */
        cp = strchr(zc.zc_value, '@');
        if (cp && (ioctl_err == 0 || !newfs)) {
@@ -1984,10 +2762,6 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                if (h != NULL) {
                        if (h->zfs_type == ZFS_TYPE_VOLUME) {
                                *cp = '@';
-                               err = zvol_create_link(hdl, h->zfs_name);
-                               if (err == 0 && ioctl_err == 0)
-                                       err = zvol_create_link(hdl,
-                                           zc.zc_value);
                        } else if (newfs || stream_avl) {
                                /*
                                 * Track the first/top of hierarchy fs,
@@ -2006,6 +2780,19 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                changelist_free(clp);
        }
 
+       if (prop_errflags & ZPROP_ERR_NOCLEAR) {
+               (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: "
+                   "failed to clear unreceived properties on %s"),
+                   zc.zc_name);
+               (void) fprintf(stderr, "\n");
+       }
+       if (prop_errflags & ZPROP_ERR_NORESTORE) {
+               (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: "
+                   "failed to restore original properties on %s"),
+                   zc.zc_name);
+               (void) fprintf(stderr, "\n");
+       }
+
        if (err || ioctl_err)
                return (-1);
 
@@ -2028,13 +2815,16 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
 
 static int
 zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags,
-    int infd, avl_tree_t *stream_avl, char **top_zfs)
+    int infd, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl,
+    char **top_zfs)
 {
        int err;
        dmu_replay_record_t drr, drr_noswap;
        struct drr_begin *drrb = &drr.drr_u.drr_begin;
        char errbuf[1024];
        zio_cksum_t zcksum = { 0 };
+       uint64_t featureflags;
+       int hdrtype;
 
        (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
            "cannot receive"));
@@ -2072,7 +2862,7 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags,
                drr.drr_type = BSWAP_32(drr.drr_type);
                drr.drr_payloadlen = BSWAP_32(drr.drr_payloadlen);
                drrb->drr_magic = BSWAP_64(drrb->drr_magic);
-               drrb->drr_version = BSWAP_64(drrb->drr_version);
+               drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
                drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
                drrb->drr_type = BSWAP_32(drrb->drr_type);
                drrb->drr_flags = BSWAP_32(drrb->drr_flags);
@@ -2086,23 +2876,45 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags,
                return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
        }
 
+       featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
+       hdrtype = DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo);
+
+       if (!DMU_STREAM_SUPPORTED(featureflags) ||
+           (hdrtype != DMU_SUBSTREAM && hdrtype != DMU_COMPOUNDSTREAM)) {
+               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                   "stream has unsupported feature, feature flags = %lx"),
+                   featureflags);
+               return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
+       }
+
        if (strchr(drrb->drr_toname, '@') == NULL) {
                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
                    "stream (bad snapshot name)"));
                return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
        }
 
-       if (drrb->drr_version == DMU_BACKUP_STREAM_VERSION) {
+       if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_SUBSTREAM) {
+               char nonpackage_sendfs[ZFS_MAXNAMELEN];
+               if (sendfs == NULL) {
+                       /*
+                        * We were not called from zfs_receive_package(). Get
+                        * the fs specified by 'zfs send'.
+                        */
+                       char *cp;
+                       (void) strlcpy(nonpackage_sendfs,
+                           drr.drr_u.drr_begin.drr_toname, ZFS_MAXNAMELEN);
+                       if ((cp = strchr(nonpackage_sendfs, '@')) != NULL)
+                               *cp = '\0';
+                       sendfs = nonpackage_sendfs;
+               }
                return (zfs_receive_one(hdl, infd, tosnap, flags,
-                   &drr, &drr_noswap, stream_avl, top_zfs));
-       } else if (drrb->drr_version == DMU_BACKUP_HEADER_VERSION) {
+                   &drr, &drr_noswap, sendfs, stream_nv, stream_avl,
+                   top_zfs));
+       } else {
+               assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
+                   DMU_COMPOUNDSTREAM);
                return (zfs_receive_package(hdl, infd, tosnap, flags,
                    &drr, &zcksum, top_zfs));
-       } else {
-               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-                   "stream is unsupported version %llu"),
-                   drrb->drr_version);
-               return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
        }
 }
 
@@ -2119,7 +2931,8 @@ zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags,
        char *top_zfs = NULL;
        int err;
 
-       err = zfs_receive_impl(hdl, tosnap, flags, infd, stream_avl, &top_zfs);
+       err = zfs_receive_impl(hdl, tosnap, flags, infd, NULL, NULL,
+           stream_avl, &top_zfs);
 
        if (err == 0 && !flags.nomount && top_zfs) {
                zfs_handle_t *zhp;
index 44faf02..24725ec 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
@@ -138,7 +137,7 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
                        if (find_vdev_problem(child[c], func))
                                return (B_TRUE);
        } else {
-               verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_STATS,
+               verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
                    (uint64_t **)&vs, &c) == 0);
 
                if (func(vs->vs_state, vs->vs_aux,
@@ -173,7 +172,8 @@ check_status(nvlist_t *config, boolean_t isimport)
 {
        nvlist_t *nvroot;
        vdev_stat_t *vs;
-       uint_t vsc;
+       pool_scan_stat_t *ps = NULL;
+       uint_t vsc, psc;
        uint64_t nerr;
        uint64_t version;
        uint64_t stateval;
@@ -184,15 +184,24 @@ check_status(nvlist_t *config, boolean_t isimport)
            &version) == 0);
        verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
            &nvroot) == 0);
-       verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
+       verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
            (uint64_t **)&vs, &vsc) == 0);
        verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
            &stateval) == 0);
-       (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
+
+       /*
+        * Currently resilvering a vdev
+        */
+       (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
+           (uint64_t **)&ps, &psc);
+       if (ps && ps->pss_func == POOL_SCAN_RESILVER &&
+           ps->pss_state == DSS_SCANNING)
+               return (ZPOOL_STATUS_RESILVERING);
 
        /*
         * Pool last accessed by another system.
         */
+       (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
        if (hostid != 0 && (unsigned long)hostid != gethostid() &&
            stateval == POOL_STATE_ACTIVE)
                return (ZPOOL_STATUS_HOSTID_MISMATCH);
@@ -289,12 +298,6 @@ check_status(nvlist_t *config, boolean_t isimport)
                return (ZPOOL_STATUS_REMOVED_DEV);
 
        /*
-        * Currently resilvering
-        */
-       if (!vs->vs_scrub_complete && vs->vs_scrub_type == POOL_SCRUB_RESILVER)
-               return (ZPOOL_STATUS_RESILVERING);
-
-       /*
         * Outdated, but usable, version
         */
        if (version < SPA_VERSION)
@@ -328,3 +331,68 @@ zpool_import_status(nvlist_t *config, char **msgid)
 
        return (ret);
 }
+
+static void
+dump_ddt_stat(const ddt_stat_t *dds, int h)
+{
+       char refcnt[6];
+       char blocks[6], lsize[6], psize[6], dsize[6];
+       char ref_blocks[6], ref_lsize[6], ref_psize[6], ref_dsize[6];
+
+       if (dds == NULL || dds->dds_blocks == 0)
+               return;
+
+       if (h == -1)
+               (void) strcpy(refcnt, "Total");
+       else
+               zfs_nicenum(1ULL << h, refcnt, sizeof (refcnt));
+
+       zfs_nicenum(dds->dds_blocks, blocks, sizeof (blocks));
+       zfs_nicenum(dds->dds_lsize, lsize, sizeof (lsize));
+       zfs_nicenum(dds->dds_psize, psize, sizeof (psize));
+       zfs_nicenum(dds->dds_dsize, dsize, sizeof (dsize));
+       zfs_nicenum(dds->dds_ref_blocks, ref_blocks, sizeof (ref_blocks));
+       zfs_nicenum(dds->dds_ref_lsize, ref_lsize, sizeof (ref_lsize));
+       zfs_nicenum(dds->dds_ref_psize, ref_psize, sizeof (ref_psize));
+       zfs_nicenum(dds->dds_ref_dsize, ref_dsize, sizeof (ref_dsize));
+
+       (void) printf("%6s   %6s   %5s   %5s   %5s   %6s   %5s   %5s   %5s\n",
+           refcnt,
+           blocks, lsize, psize, dsize,
+           ref_blocks, ref_lsize, ref_psize, ref_dsize);
+}
+
+/*
+ * Print the DDT histogram and the column totals.
+ */
+void
+zpool_dump_ddt(const ddt_stat_t *dds_total, const ddt_histogram_t *ddh)
+{
+       int h;
+
+       (void) printf("\n");
+
+       (void) printf("bucket   "
+           "           allocated             "
+           "          referenced          \n");
+       (void) printf("______   "
+           "______________________________   "
+           "______________________________\n");
+
+       (void) printf("%6s   %6s   %5s   %5s   %5s   %6s   %5s   %5s   %5s\n",
+           "refcnt",
+           "blocks", "LSIZE", "PSIZE", "DSIZE",
+           "blocks", "LSIZE", "PSIZE", "DSIZE");
+
+       (void) printf("%6s   %6s   %5s   %5s   %5s   %6s   %5s   %5s   %5s\n",
+           "------",
+           "------", "-----", "-----", "-----",
+           "------", "-----", "-----", "-----");
+
+       for (h = 0; h < 64; h++)
+               dump_ddt_stat(&ddh->ddh_stat[h], h);
+
+       dump_ddt_stat(dds_total, -1);
+
+       (void) printf("\n");
+}
index 4da0fb4..2e73f76 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
@@ -94,8 +93,6 @@ libzfs_error_description(libzfs_handle_t *hdl)
        case EZFS_VOLTOOBIG:
                return (dgettext(TEXT_DOMAIN, "volume size exceeds limit for "
                    "this system"));
-       case EZFS_VOLHASDATA:
-               return (dgettext(TEXT_DOMAIN, "volume has data"));
        case EZFS_INVALIDNAME:
                return (dgettext(TEXT_DOMAIN, "invalid name"));
        case EZFS_BADRESTORE:
@@ -138,16 +135,12 @@ libzfs_error_description(libzfs_handle_t *hdl)
                return (dgettext(TEXT_DOMAIN, "smb remove share failed"));
        case EZFS_SHARESMBFAILED:
                return (dgettext(TEXT_DOMAIN, "smb add share failed"));
-       case EZFS_ISCSISVCUNAVAIL:
-               return (dgettext(TEXT_DOMAIN,
-                   "iscsitgt service need to be enabled by "
-                   "a privileged user"));
-       case EZFS_DEVLINKS:
-               return (dgettext(TEXT_DOMAIN, "failed to create /dev links"));
        case EZFS_PERM:
                return (dgettext(TEXT_DOMAIN, "permission denied"));
        case EZFS_NOSPC:
                return (dgettext(TEXT_DOMAIN, "out of space"));
+       case EZFS_FAULT:
+               return (dgettext(TEXT_DOMAIN, "bad address"));
        case EZFS_IO:
                return (dgettext(TEXT_DOMAIN, "I/O error"));
        case EZFS_INTR:
@@ -161,12 +154,6 @@ libzfs_error_description(libzfs_handle_t *hdl)
                return (dgettext(TEXT_DOMAIN, "recursive dataset dependency"));
        case EZFS_NOHISTORY:
                return (dgettext(TEXT_DOMAIN, "no history available"));
-       case EZFS_UNSHAREISCSIFAILED:
-               return (dgettext(TEXT_DOMAIN,
-                   "iscsitgtd failed request to unshare"));
-       case EZFS_SHAREISCSIFAILED:
-               return (dgettext(TEXT_DOMAIN,
-                   "iscsitgtd failed request to share"));
        case EZFS_POOLPROPS:
                return (dgettext(TEXT_DOMAIN, "failed to retrieve "
                    "pool properties"));
@@ -218,6 +205,20 @@ libzfs_error_description(libzfs_handle_t *hdl)
        case EZFS_REFTAG_HOLD:
                return (dgettext(TEXT_DOMAIN, "tag already exists on this "
                    "dataset"));
+       case EZFS_TAGTOOLONG:
+               return (dgettext(TEXT_DOMAIN, "tag too long"));
+       case EZFS_PIPEFAILED:
+               return (dgettext(TEXT_DOMAIN, "pipe create failed"));
+       case EZFS_THREADCREATEFAILED:
+               return (dgettext(TEXT_DOMAIN, "thread create failed"));
+       case EZFS_POSTSPLIT_ONLINE:
+               return (dgettext(TEXT_DOMAIN, "disk was split from this pool "
+                   "into a new one"));
+       case EZFS_SCRUBBING:
+               return (dgettext(TEXT_DOMAIN, "currently scrubbing; "
+                   "use 'zpool scrub -s' to cancel current scrub"));
+       case EZFS_NO_SCRUB:
+               return (dgettext(TEXT_DOMAIN, "there is no active scrub"));
        case EZFS_UNKNOWN:
                return (dgettext(TEXT_DOMAIN, "unknown error"));
        default:
@@ -306,6 +307,10 @@ zfs_common_error(libzfs_handle_t *hdl, int error, const char *fmt,
                zfs_verror(hdl, EZFS_IO, fmt, ap);
                return (-1);
 
+       case EFAULT:
+               zfs_verror(hdl, EZFS_FAULT, fmt, ap);
+               return (-1);
+
        case EINTR:
                zfs_verror(hdl, EZFS_INTR, fmt, ap);
                return (-1);
@@ -378,7 +383,7 @@ zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
                zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap);
                break;
        default:
-               zfs_error_aux(hdl, strerror(errno));
+               zfs_error_aux(hdl, strerror(error));
                zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);
                break;
        }
@@ -610,6 +615,7 @@ libzfs_fini(libzfs_handle_t *hdl)
        if (hdl->libzfs_log_str)
                (void) free(hdl->libzfs_log_str);
        zpool_free_handles(hdl);
+       libzfs_fru_clear(hdl, B_TRUE);
        namespace_clear(hdl);
        libzfs_mnttab_fini(hdl);
        free(hdl);
@@ -686,7 +692,7 @@ int
 zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len)
 {
        if (len == 0)
-               len = 2048;
+               len = 4*1024;
        zc->zc_nvlist_dst_size = len;
        if ((zc->zc_nvlist_dst = (uint64_t)(uintptr_t)
            zfs_alloc(hdl, zc->zc_nvlist_dst_size)) == NULL)
@@ -812,6 +818,8 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
            "PROPERTY"));
        cbp->cb_colwidths[GET_COL_VALUE] = strlen(dgettext(TEXT_DOMAIN,
            "VALUE"));
+       cbp->cb_colwidths[GET_COL_RECVD] = strlen(dgettext(TEXT_DOMAIN,
+           "RECEIVED"));
        cbp->cb_colwidths[GET_COL_SOURCE] = strlen(dgettext(TEXT_DOMAIN,
            "SOURCE"));
 
@@ -825,7 +833,7 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
         * inheriting from the longest name.  This is acceptable because in the
         * majority of cases 'SOURCE' is the last column displayed, and we don't
         * use the width anyway.  Note that the 'VALUE' column can be oversized,
-        * if the name of the property is much longer the any values we find.
+        * if the name of the property is much longer than any values we find.
         */
        for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) {
                /*
@@ -856,6 +864,11 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
                    pl->pl_width > cbp->cb_colwidths[GET_COL_VALUE])
                        cbp->cb_colwidths[GET_COL_VALUE] = pl->pl_width;
 
+               /* 'RECEIVED' column. */
+               if (pl != cbp->cb_proplist &&
+                   pl->pl_recvd_width > cbp->cb_colwidths[GET_COL_RECVD])
+                       cbp->cb_colwidths[GET_COL_RECVD] = pl->pl_recvd_width;
+
                /*
                 * 'NAME' and 'SOURCE' columns
                 */
@@ -871,7 +884,7 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
        /*
         * Now go through and print the headers.
         */
-       for (i = 0; i < 4; i++) {
+       for (i = 0; i < ZFS_GET_NCOLS; i++) {
                switch (cbp->cb_columns[i]) {
                case GET_COL_NAME:
                        title = dgettext(TEXT_DOMAIN, "NAME");
@@ -882,6 +895,9 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
                case GET_COL_VALUE:
                        title = dgettext(TEXT_DOMAIN, "VALUE");
                        break;
+               case GET_COL_RECVD:
+                       title = dgettext(TEXT_DOMAIN, "RECEIVED");
+                       break;
                case GET_COL_SOURCE:
                        title = dgettext(TEXT_DOMAIN, "SOURCE");
                        break;
@@ -890,7 +906,8 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
                }
 
                if (title != NULL) {
-                       if (i == 3 || cbp->cb_columns[i + 1] == 0)
+                       if (i == (ZFS_GET_NCOLS - 1) ||
+                           cbp->cb_columns[i + 1] == GET_COL_NONE)
                                (void) printf("%s", title);
                        else
                                (void) printf("%-*s  ",
@@ -908,7 +925,7 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
 void
 zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp,
     const char *propname, const char *value, zprop_source_t sourcetype,
-    const char *source)
+    const char *source, const char *recvd_value)
 {
        int i;
        const char *str;
@@ -923,7 +940,7 @@ zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp,
        if (cbp->cb_first)
                zprop_print_headers(cbp, cbp->cb_type);
 
-       for (i = 0; i < 4; i++) {
+       for (i = 0; i < ZFS_GET_NCOLS; i++) {
                switch (cbp->cb_columns[i]) {
                case GET_COL_NAME:
                        str = name;
@@ -960,14 +977,21 @@ zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp,
                                    "inherited from %s", source);
                                str = buf;
                                break;
+                       case ZPROP_SRC_RECEIVED:
+                               str = "received";
+                               break;
                        }
                        break;
 
+               case GET_COL_RECVD:
+                       str = (recvd_value == NULL ? "-" : recvd_value);
+                       break;
+
                default:
                        continue;
                }
 
-               if (cbp->cb_columns[i + 1] == 0)
+               if (cbp->cb_columns[i + 1] == GET_COL_NONE)
                        (void) printf("%s", str);
                else if (cbp->cb_scripted)
                        (void) printf("%s\t", str);
@@ -975,7 +999,6 @@ zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp,
                        (void) printf("%-*s  ",
                            cbp->cb_colwidths[cbp->cb_columns[i]],
                            str);
-
        }
 
        (void) printf("\n");
@@ -1037,7 +1060,7 @@ zfs_nicestrtonum(libzfs_handle_t *hdl, const char *value, uint64_t *num)
                return (-1);
        }
 
-       /* Rely on stroull() to process the numeric portion.  */
+       /* Rely on strtoull() to process the numeric portion.  */
        errno = 0;
        *num = strtoull(value, &end, 10);
 
index 230c233..9a6d712 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -75,6 +75,7 @@ extern "C" {
 #include <sys/u8_textprep.h>
 #include <sys/sysevent/eventdefs.h>
 #include <sys/sysevent/dev.h>
+#include <sys/sunddi.h>
 
 /*
  * Debugging
@@ -105,21 +106,27 @@ extern void vpanic(const char *, __va_list);
 
 #define        fm_panic        panic
 
+extern int aok;
+
 /* This definition is copied from assert.h. */
 #if defined(__STDC__)
 #if __STDC_VERSION__ - 0 >= 199901L
-#define        verify(EX) (void)((EX) || \
+#define        zverify(EX) (void)((EX) || (aok) || \
        (__assert_c99(#EX, __FILE__, __LINE__, __func__), 0))
 #else
-#define        verify(EX) (void)((EX) || (__assert(#EX, __FILE__, __LINE__), 0))
+#define        zverify(EX) (void)((EX) || (aok) || \
+       (__assert(#EX, __FILE__, __LINE__), 0))
 #endif /* __STDC_VERSION__ - 0 >= 199901L */
 #else
-#define        verify(EX) (void)((EX) || (_assert("EX", __FILE__, __LINE__), 0))
+#define        zverify(EX) (void)((EX) || (aok) || \
+       (_assert("EX", __FILE__, __LINE__), 0))
 #endif /* __STDC__ */
 
 
-#define        VERIFY  verify
-#define        ASSERT  assert
+#define        VERIFY  zverify
+#define        ASSERT  zverify
+#undef assert
+#define        assert  zverify
 
 extern void __assert(const char *, const char *, int);
 
@@ -130,7 +137,7 @@ extern void __assert(const char *, const char *, int);
 #define        VERIFY3_IMPL(LEFT, OP, RIGHT, TYPE) do { \
        const TYPE __left = (TYPE)(LEFT); \
        const TYPE __right = (TYPE)(RIGHT); \
-       if (!(__left OP __right)) { \
+       if (!(__left OP __right) && (!aok)) { \
                char *__buf = alloca(256); \
                (void) snprintf(__buf, 256, "%s %s %s (0x%llx %s 0x%llx)", \
                        #LEFT, #OP, #RIGHT, \
@@ -196,6 +203,18 @@ typedef struct kthread kthread_t;
 #define        thread_create(stk, stksize, func, arg, len, pp, state, pri)     \
        zk_thread_create(func, arg)
 #define        thread_exit() thr_exit(NULL)
+#define        thread_join(t)  panic("libzpool cannot join threads")
+
+#define        newproc(f, a, cid, pri, ctp, pid)       (ENOSYS)
+
+/* in libzpool, p0 exists only to have its address taken */
+struct proc {
+       uintptr_t       this_is_never_used_dont_dereference_it;
+};
+
+extern struct proc p0;
+
+#define        PS_NONE         -1
 
 extern kthread_t *zk_thread_create(void (*func)(), void *arg);
 
@@ -318,20 +337,27 @@ typedef void (task_func_t)(void *);
 #define        TASKQ_PREPOPULATE       0x0001
 #define        TASKQ_CPR_SAFE          0x0002  /* Use CPR safe protocol */
 #define        TASKQ_DYNAMIC           0x0004  /* Use dynamic thread scheduling */
-#define        TASKQ_THREADS_CPU_PCT   0x0008  /* Use dynamic thread scheduling */
+#define        TASKQ_THREADS_CPU_PCT   0x0008  /* Scale # threads by # cpus */
+#define        TASKQ_DC_BATCH          0x0010  /* Mark threads as batch */
 
 #define        TQ_SLEEP        KM_SLEEP        /* Can block for memory */
 #define        TQ_NOSLEEP      KM_NOSLEEP      /* cannot block for memory; may fail */
-#define        TQ_NOQUEUE      0x02    /* Do not enqueue if can't dispatch */
+#define        TQ_NOQUEUE      0x02            /* Do not enqueue if can't dispatch */
+#define        TQ_FRONT        0x08            /* Queue in front */
 
 extern taskq_t *system_taskq;
 
 extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
+#define        taskq_create_proc(a, b, c, d, e, p, f) \
+           (taskq_create(a, b, c, d, e, f))
+#define        taskq_create_sysdc(a, b, d, e, p, dc, f) \
+           (taskq_create(a, b, maxclsyspri, d, e, f))
 extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
 extern void    taskq_destroy(taskq_t *);
 extern void    taskq_wait(taskq_t *);
 extern int     taskq_member(taskq_t *, void *);
 extern void    system_taskq_init(void);
+extern void    system_taskq_fini(void);
 
 #define        XVA_MAPSIZE     3
 #define        XVA_MAGIC       0x78766174
@@ -345,6 +371,7 @@ typedef struct vnode {
        char            *v_path;
 } vnode_t;
 
+#define        AV_SCANSTAMP_SZ 32              /* length of anti-virus scanstamp */
 
 typedef struct xoptattr {
        timestruc_t     xoa_createtime; /* Create time of file */
@@ -360,6 +387,8 @@ typedef struct xoptattr {
        uint8_t         xoa_opaque;
        uint8_t         xoa_av_quarantined;
        uint8_t         xoa_av_modified;
+       uint8_t         xoa_av_scanstamp[AV_SCANSTAMP_SZ];
+       uint8_t         xoa_reparse;
 } xoptattr_t;
 
 typedef struct vattr {
@@ -406,9 +435,11 @@ typedef struct vsecattr {
 
 #define        CRCREAT         0
 
+extern int fop_getattr(vnode_t *vp, vattr_t *vap);
+
 #define        VOP_CLOSE(vp, f, c, o, cr, ct)  0
 #define        VOP_PUTPAGE(vp, of, sz, fl, cr, ct)     0
-#define        VOP_GETATTR(vp, vap, fl, cr, ct)  ((vap)->va_size = (vp)->v_size, 0)
+#define        VOP_GETATTR(vp, vap, fl, cr, ct)  fop_getattr((vp), (vap));
 
 #define        VOP_FSYNC(vp, f, cr, ct)        fsync((vp)->v_fd)
 
@@ -433,13 +464,18 @@ extern vnode_t *rootdir;
 /*
  * Random stuff
  */
-#define        lbolt   (gethrtime() >> 23)
-#define        lbolt64 (gethrtime() >> 23)
+#define        ddi_get_lbolt()         (gethrtime() >> 23)
+#define        ddi_get_lbolt64()       (gethrtime() >> 23)
 #define        hz      119     /* frequency when using gethrtime() >> 23 for lbolt */
 
 extern void delay(clock_t ticks);
 
 #define        gethrestime_sec() time(NULL)
+#define        gethrestime(t) \
+       do {\
+               (t)->tv_sec = gethrestime_sec();\
+               (t)->tv_nsec = 0;\
+       } while (0);
 
 #define        max_ncpus       64
 
@@ -490,6 +526,9 @@ typedef struct callb_cpr {
 #define        zone_dataset_visible(x, y)      (1)
 #define        INGLOBALZONE(z)                 (1)
 
+extern char *kmem_asprintf(const char *fmt, ...);
+#define        strfree(str) kmem_free((str), strlen(str)+1)
+
 /*
  * Hostname information
  */
@@ -497,6 +536,9 @@ extern char hw_serial[];    /* for userland-emulated hostid access */
 extern int ddi_strtoul(const char *str, char **nptr, int base,
     unsigned long *result);
 
+extern int ddi_strtoull(const char *str, char **nptr, int base,
+    u_longlong_t *result);
+
 /* ZFS Boot Related stuff. */
 
 struct _buf {
index 89108fe..5284c12 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -42,6 +42,7 @@
  * Emulation of kernel services in userland.
  */
 
+int aok;
 uint64_t physmem;
 vnode_t *rootdir = (vnode_t *)0xabcd1234;
 char hw_serial[HW_HOSTID_LEN];
@@ -50,6 +51,9 @@ struct utsname utsname = {
        "userland", "libzpool", "1", "1", "na"
 };
 
+/* this only exists to have its address taken */
+struct proc p0;
+
 /*
  * =========================================================================
  * threads
@@ -269,7 +273,7 @@ cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
        clock_t delta;
 
 top:
-       delta = abstime - lbolt;
+       delta = abstime - ddi_get_lbolt();
        if (delta <= 0)
                return (-1);
 
@@ -444,6 +448,24 @@ vn_close(vnode_t *vp)
        umem_free(vp, sizeof (vnode_t));
 }
 
+/*
+ * At a minimum we need to update the size since vdev_reopen()
+ * will no longer call vn_openat().
+ */
+int
+fop_getattr(vnode_t *vp, vattr_t *vap)
+{
+       struct stat64 st;
+
+       if (fstat64(vp->v_fd, &st) == -1) {
+               close(vp->v_fd);
+               return (errno);
+       }
+
+       vap->va_size = st.st_size;
+       return (0);
+}
+
 #ifdef ZFS_DEBUG
 
 /*
@@ -754,6 +776,17 @@ ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result)
        return (0);
 }
 
+int
+ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result)
+{
+       char *end;
+
+       *result = strtoull(str, &end, base);
+       if (*result == 0)
+               return (errno);
+       return (0);
+}
+
 /*
  * =========================================================================
  * kernel emulation setup & teardown
@@ -779,7 +812,8 @@ kernel_init(int mode)
        dprintf("physmem = %llu pages (%.2f GB)\n", physmem,
            (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));
 
-       (void) snprintf(hw_serial, sizeof (hw_serial), "%ld", gethostid());
+       (void) snprintf(hw_serial, sizeof (hw_serial), "%ld",
+           (mode & FWRITE) ? gethostid() : 0);
 
        VERIFY((random_fd = open("/dev/random", O_RDONLY)) != -1);
        VERIFY((urandom_fd = open("/dev/urandom", O_RDONLY)) != -1);
@@ -794,6 +828,8 @@ kernel_fini(void)
 {
        spa_fini();
 
+       system_taskq_fini();
+
        close(random_fd);
        close(urandom_fd);
 
@@ -884,3 +920,27 @@ ksiddomain_rele(ksiddomain_t *ksid)
        spa_strfree(ksid->kd_name);
        umem_free(ksid, sizeof (ksiddomain_t));
 }
+
+/*
+ * Do not change the length of the returned string; it must be freed
+ * with strfree().
+ */
+char *
+kmem_asprintf(const char *fmt, ...)
+{
+       int size;
+       va_list adx;
+       char *buf;
+
+       va_start(adx, fmt);
+       size = vsnprintf(NULL, 0, fmt, adx) + 1;
+       va_end(adx);
+
+       buf = kmem_alloc(size, KM_SLEEP);
+
+       va_start(adx, fmt);
+       size = vsnprintf(buf, size, fmt, adx);
+       va_end(adx);
+
+       return (buf);
+}
index 1a73fe8..8db5d11 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -49,6 +49,8 @@ struct taskq {
        int             tq_nalloc;
        int             tq_minalloc;
        int             tq_maxalloc;
+       kcondvar_t      tq_maxalloc_cv;
+       int             tq_maxalloc_wait;
        task_t          *tq_freelist;
        task_t          tq_task;
 };
@@ -57,26 +59,36 @@ static task_t *
 task_alloc(taskq_t *tq, int tqflags)
 {
        task_t *t;
+       int rv;
 
-       if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) {
+again: if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) {
                tq->tq_freelist = t->task_next;
        } else {
-               mutex_exit(&tq->tq_lock);
                if (tq->tq_nalloc >= tq->tq_maxalloc) {
-                       if (!(tqflags & KM_SLEEP)) {
-                               mutex_enter(&tq->tq_lock);
+                       if (!(tqflags & KM_SLEEP))
                                return (NULL);
-                       }
+
                        /*
                         * We don't want to exceed tq_maxalloc, but we can't
                         * wait for other tasks to complete (and thus free up
                         * task structures) without risking deadlock with
                         * the caller.  So, we just delay for one second
-                        * to throttle the allocation rate.
+                        * to throttle the allocation rate. If we have tasks
+                        * complete before one second timeout expires then
+                        * taskq_ent_free will signal us and we will
+                        * immediately retry the allocation.
                         */
-                       delay(hz);
+                       tq->tq_maxalloc_wait++;
+                       rv = cv_timedwait(&tq->tq_maxalloc_cv,
+                           &tq->tq_lock, ddi_get_lbolt() + hz);
+                       tq->tq_maxalloc_wait--;
+                       if (rv > 0)
+                               goto again;             /* signaled */
                }
+               mutex_exit(&tq->tq_lock);
+
                t = kmem_alloc(sizeof (task_t), tqflags);
+
                mutex_enter(&tq->tq_lock);
                if (t != NULL)
                        tq->tq_nalloc++;
@@ -96,6 +108,9 @@ task_free(taskq_t *tq, task_t *t)
                kmem_free(t, sizeof (task_t));
                mutex_enter(&tq->tq_lock);
        }
+
+       if (tq->tq_maxalloc_wait)
+               cv_signal(&tq->tq_maxalloc_cv);
 }
 
 taskqid_t
@@ -114,8 +129,13 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags)
                mutex_exit(&tq->tq_lock);
                return (0);
        }
-       t->task_next = &tq->tq_task;
-       t->task_prev = tq->tq_task.task_prev;
+       if (tqflags & TQ_FRONT) {
+               t->task_next = tq->tq_task.task_next;
+               t->task_prev = &tq->tq_task;
+       } else {
+               t->task_next = &tq->tq_task;
+               t->task_prev = tq->tq_task.task_prev;
+       }
        t->task_next->task_prev = t;
        t->task_prev->task_next = t;
        t->task_func = func;
@@ -191,6 +211,7 @@ taskq_create(const char *name, int nthreads, pri_t pri,
        mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL);
+       cv_init(&tq->tq_maxalloc_cv, NULL, CV_DEFAULT, NULL);
        tq->tq_flags = flags | TASKQ_ACTIVE;
        tq->tq_active = nthreads;
        tq->tq_nthreads = nthreads;
@@ -247,6 +268,7 @@ taskq_destroy(taskq_t *tq)
        mutex_destroy(&tq->tq_lock);
        cv_destroy(&tq->tq_dispatch_cv);
        cv_destroy(&tq->tq_wait_cv);
+       cv_destroy(&tq->tq_maxalloc_cv);
 
        kmem_free(tq, sizeof (taskq_t));
 }
@@ -272,3 +294,10 @@ system_taskq_init(void)
        system_taskq = taskq_create("system_taskq", 64, minclsyspri, 4, 512,
            TASKQ_DYNAMIC | TASKQ_PREPOPULATE);
 }
+
+void
+system_taskq_fini(void)
+{
+       taskq_destroy(system_taskq);
+       system_taskq = NULL; /* defensive */
+}
index 781edb6..9b99531 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <assert.h>
@@ -90,7 +89,7 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent)
                if (is_log)
                        prefix = "log ";
 
-               if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
+               if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
                    (uint64_t **)&vs, &c) != 0)
                        vs = &v0;
 
index c9727c6..dd39c12 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
-
 /*
  * AVL - generic AVL tree implementation for kernel use
  *
@@ -243,7 +240,7 @@ avl_nearest(avl_tree_t *tree, avl_index_t where, int direction)
  *     "void *"  of the found tree node
  */
 void *
-avl_find(avl_tree_t *tree, void *value, avl_index_t *where)
+avl_find(avl_tree_t *tree, const void *value, avl_index_t *where)
 {
        avl_node_t *node;
        avl_node_t *prev = NULL;
index 02263a5..ba305c9 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef        _AVL_H
 #define        _AVL_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 /*
  * This is a private header file.  Applications should not directly include
  * this file.
@@ -163,7 +161,7 @@ extern void avl_create(avl_tree_t *tree,
  * node   - node that has the value being looked for
  * where  - position for use with avl_nearest() or avl_insert(), may be NULL
  */
-extern void *avl_find(avl_tree_t *tree, void *node, avl_index_t *where);
+extern void *avl_find(avl_tree_t *tree, const void *node, avl_index_t *where);
 
 /*
  * Insert a node into the tree.
index 9e76854..58037b0 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef        _SYS_NVPAIR_H
 #define        _SYS_NVPAIR_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/types.h>
 #include <sys/errno.h>
 #include <sys/va_list.h>
@@ -199,6 +197,7 @@ int nvlist_add_double(nvlist_t *, const char *, double);
 
 int nvlist_remove(nvlist_t *, const char *, data_type_t);
 int nvlist_remove_all(nvlist_t *, const char *);
+int nvlist_remove_nvpair(nvlist_t *, nvpair_t *);
 
 int nvlist_lookup_boolean(nvlist_t *, const char *);
 int nvlist_lookup_boolean_value(nvlist_t *, const char *, boolean_t *);
@@ -237,9 +236,11 @@ int nvlist_lookup_nvpair(nvlist_t *, const char *, nvpair_t **);
 int nvlist_lookup_nvpair_embedded_index(nvlist_t *, const char *, nvpair_t **,
     int *, char **);
 boolean_t nvlist_exists(nvlist_t *, const char *);
+boolean_t nvlist_empty(nvlist_t *);
 
 /* processing nvpair */
 nvpair_t *nvlist_next_nvpair(nvlist_t *, nvpair_t *);
+nvpair_t *nvlist_prev_nvpair(nvlist_t *, nvpair_t *);
 char *nvpair_name(nvpair_t *);
 data_type_t nvpair_type(nvpair_t *);
 int nvpair_type_is_array(nvpair_t *);
index 77891bf..8115091 100644 (file)
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/stropts.h>
 #include <sys/debug.h>
 #include <sys/isa_defs.h>
@@ -692,6 +690,18 @@ nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type)
        return (ENOENT);
 }
 
+int
+nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+       if (nvl == NULL || nvp == NULL)
+               return (EINVAL);
+
+       nvp_buf_unlink(nvl, nvp);
+       nvpair_free(nvp);
+       nvp_buf_free(nvl, nvp);
+       return (0);
+}
+
 /*
  * This function calculates the size of an nvpair value.
  *
@@ -1162,6 +1172,42 @@ nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *nvp)
        return (curr != NULL ? &curr->nvi_nvp : NULL);
 }
 
+nvpair_t *
+nvlist_prev_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+       nvpriv_t *priv;
+       i_nvp_t *curr;
+
+       if (nvl == NULL ||
+           (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+               return (NULL);
+
+       curr = NVPAIR2I_NVP(nvp);
+
+       if (nvp == NULL)
+               curr = priv->nvp_last;
+       else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp))
+               curr = curr->nvi_prev;
+       else
+               curr = NULL;
+
+       priv->nvp_curr = curr;
+
+       return (curr != NULL ? &curr->nvi_nvp : NULL);
+}
+
+boolean_t
+nvlist_empty(nvlist_t *nvl)
+{
+       nvpriv_t *priv;
+
+       if (nvl == NULL ||
+           (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+               return (B_TRUE);
+
+       return (priv->nvp_list == NULL);
+}
+
 char *
 nvpair_name(nvpair_t *nvp)
 {
index 86b36a8..3c95c91 100644 (file)
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
+/* Portions Copyright 2010 Robert Milkowski */
+
 #ifndef        _SYS_FS_ZFS_H
 #define        _SYS_FS_ZFS_H
 
@@ -86,12 +87,11 @@ typedef enum {
        ZFS_PROP_READONLY,
        ZFS_PROP_ZONED,
        ZFS_PROP_SNAPDIR,
-       ZFS_PROP_ACLMODE,
+       ZFS_PROP_PRIVATE,               /* not exposed to user, temporary */
        ZFS_PROP_ACLINHERIT,
        ZFS_PROP_CREATETXG,             /* not exposed to the user */
        ZFS_PROP_NAME,                  /* not exposed to the user */
        ZFS_PROP_CANMOUNT,
-       ZFS_PROP_SHAREISCSI,
        ZFS_PROP_ISCSIOPTIONS,          /* not exposed to the user */
        ZFS_PROP_XATTR,
        ZFS_PROP_NUMCLONES,             /* not exposed to the user */
@@ -116,6 +116,12 @@ typedef enum {
        ZFS_PROP_STMF_SHAREINFO,        /* not exposed to the user */
        ZFS_PROP_DEFER_DESTROY,
        ZFS_PROP_USERREFS,
+       ZFS_PROP_LOGBIAS,
+       ZFS_PROP_UNIQUE,                /* not exposed to the user */
+       ZFS_PROP_OBJSETID,              /* not exposed to the user */
+       ZFS_PROP_DEDUP,
+       ZFS_PROP_MLSLABEL,
+       ZFS_PROP_SYNC,
        ZFS_NUM_PROPS
 } zfs_prop_t;
 
@@ -138,8 +144,6 @@ extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS];
 typedef enum {
        ZPOOL_PROP_NAME,
        ZPOOL_PROP_SIZE,
-       ZPOOL_PROP_USED,
-       ZPOOL_PROP_AVAILABLE,
        ZPOOL_PROP_CAPACITY,
        ZPOOL_PROP_ALTROOT,
        ZPOOL_PROP_HEALTH,
@@ -152,6 +156,10 @@ typedef enum {
        ZPOOL_PROP_FAILUREMODE,
        ZPOOL_PROP_LISTSNAPS,
        ZPOOL_PROP_AUTOEXPAND,
+       ZPOOL_PROP_DEDUPDITTO,
+       ZPOOL_PROP_DEDUPRATIO,
+       ZPOOL_PROP_FREE,
+       ZPOOL_PROP_ALLOCATED,
        ZPOOL_NUM_PROPS
 } zpool_prop_t;
 
@@ -166,10 +174,27 @@ typedef enum {
        ZPROP_SRC_DEFAULT = 0x2,
        ZPROP_SRC_TEMPORARY = 0x4,
        ZPROP_SRC_LOCAL = 0x8,
-       ZPROP_SRC_INHERITED = 0x10
+       ZPROP_SRC_INHERITED = 0x10,
+       ZPROP_SRC_RECEIVED = 0x20
 } zprop_source_t;
 
-#define        ZPROP_SRC_ALL   0x1f
+#define        ZPROP_SRC_ALL   0x3f
+
+#define        ZPROP_SOURCE_VAL_RECVD  "$recvd"
+#define        ZPROP_N_MORE_ERRORS     "N_MORE_ERRORS"
+/*
+ * Dataset flag implemented as a special entry in the props zap object
+ * indicating that the dataset has received properties on or after
+ * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties
+ * just as it did in earlier versions, and thereafter, local properties are
+ * preserved.
+ */
+#define        ZPROP_HAS_RECVD         "$hasrecvd"
+
+typedef enum {
+       ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */
+       ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */
+} zprop_errflags_t;
 
 typedef int (*zprop_func)(int, void *);
 
@@ -191,9 +216,10 @@ boolean_t zfs_prop_setonce(zfs_prop_t);
 const char *zfs_prop_to_name(zfs_prop_t);
 zfs_prop_t zfs_name_to_prop(const char *);
 boolean_t zfs_prop_user(const char *);
-boolean_t zfs_prop_userquota(const char *name);
+boolean_t zfs_prop_userquota(const char *);
 int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **);
 int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *);
+uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed);
 boolean_t zfs_prop_valid_for_type(int, zfs_type_t);
 
 /*
@@ -206,6 +232,7 @@ uint64_t zpool_prop_default_numeric(zpool_prop_t);
 boolean_t zpool_prop_readonly(zpool_prop_t);
 int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **);
 int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *);
+uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed);
 
 /*
  * Definitions for the Delegation.
@@ -236,6 +263,8 @@ typedef enum {
 #define        ZFS_DELEG_PERM_GID      "gid"
 #define        ZFS_DELEG_PERM_GROUPS   "groups"
 
+#define        ZFS_MLSLABEL_DEFAULT    "none"
+
 #define        ZFS_SMB_ACL_SRC         "src"
 #define        ZFS_SMB_ACL_TARGET      "target"
 
@@ -245,6 +274,11 @@ typedef enum {
        ZFS_CANMOUNT_NOAUTO = 2
 } zfs_canmount_type_t;
 
+typedef enum {
+       ZFS_LOGBIAS_LATENCY = 0,
+       ZFS_LOGBIAS_THROUGHPUT = 1
+} zfs_logbias_op_t;
+
 typedef enum zfs_share_op {
        ZFS_SHARE_NFS = 0,
        ZFS_UNSHARE_NFS = 1,
@@ -265,6 +299,12 @@ typedef enum zfs_cache_type {
        ZFS_CACHE_ALL = 2
 } zfs_cache_type_t;
 
+typedef enum {
+       ZFS_SYNC_STANDARD = 0,
+       ZFS_SYNC_ALWAYS = 1,
+       ZFS_SYNC_DISABLED = 2
+} zfs_sync_type_t;
+
 
 /*
  * On-disk version number.
@@ -287,14 +327,22 @@ typedef enum zfs_cache_type {
 #define        SPA_VERSION_16                  16ULL
 #define        SPA_VERSION_17                  17ULL
 #define        SPA_VERSION_18                  18ULL
+#define        SPA_VERSION_19                  19ULL
+#define        SPA_VERSION_20                  20ULL
+#define        SPA_VERSION_21                  21ULL
+#define        SPA_VERSION_22                  22ULL
+#define        SPA_VERSION_23                  23ULL
+#define        SPA_VERSION_24                  24ULL
+#define        SPA_VERSION_25                  25ULL
+#define        SPA_VERSION_26                  26ULL
 /*
  * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
  * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
  * and do the appropriate changes.  Also bump the version number in
  * usr/src/grub/capability.
  */
-#define        SPA_VERSION                     SPA_VERSION_18
-#define        SPA_VERSION_STRING              "18"
+#define        SPA_VERSION                     SPA_VERSION_26
+#define        SPA_VERSION_STRING              "26"
 
 /*
  * Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -311,7 +359,7 @@ typedef enum zfs_cache_type {
 #define        SPA_VERSION_DITTO_BLOCKS        SPA_VERSION_2
 #define        SPA_VERSION_SPARES              SPA_VERSION_3
 #define        SPA_VERSION_RAIDZ2              SPA_VERSION_3
-#define        SPA_VERSION_BPLIST_ACCOUNT      SPA_VERSION_3
+#define        SPA_VERSION_BPOBJ_ACCOUNT       SPA_VERSION_3
 #define        SPA_VERSION_RAIDZ_DEFLATE       SPA_VERSION_3
 #define        SPA_VERSION_DNODE_BYTES         SPA_VERSION_3
 #define        SPA_VERSION_ZPOOL_HISTORY       SPA_VERSION_4
@@ -334,6 +382,15 @@ typedef enum zfs_cache_type {
 #define        SPA_VERSION_STMF_PROP           SPA_VERSION_16
 #define        SPA_VERSION_RAIDZ3              SPA_VERSION_17
 #define        SPA_VERSION_USERREFS            SPA_VERSION_18
+#define        SPA_VERSION_HOLES               SPA_VERSION_19
+#define        SPA_VERSION_ZLE_COMPRESSION     SPA_VERSION_20
+#define        SPA_VERSION_DEDUP               SPA_VERSION_21
+#define        SPA_VERSION_RECVD_PROPS         SPA_VERSION_22
+#define        SPA_VERSION_SLIM_ZIL            SPA_VERSION_23
+#define        SPA_VERSION_SA                  SPA_VERSION_24
+#define        SPA_VERSION_SCAN                SPA_VERSION_25
+#define        SPA_VERSION_DIR_CLONES          SPA_VERSION_26
+#define        SPA_VERSION_DEADLISTS           SPA_VERSION_26
 
 /*
  * ZPL version - rev'd whenever an incompatible on-disk format change
@@ -347,8 +404,9 @@ typedef enum zfs_cache_type {
 #define        ZPL_VERSION_2                   2ULL
 #define        ZPL_VERSION_3                   3ULL
 #define        ZPL_VERSION_4                   4ULL
-#define        ZPL_VERSION                     ZPL_VERSION_4
-#define        ZPL_VERSION_STRING              "4"
+#define        ZPL_VERSION_5                   5ULL
+#define        ZPL_VERSION                     ZPL_VERSION_5
+#define        ZPL_VERSION_STRING              "5"
 
 #define        ZPL_VERSION_INITIAL             ZPL_VERSION_1
 #define        ZPL_VERSION_DIRENT_TYPE         ZPL_VERSION_2
@@ -356,6 +414,23 @@ typedef enum zfs_cache_type {
 #define        ZPL_VERSION_NORMALIZATION       ZPL_VERSION_3
 #define        ZPL_VERSION_SYSATTR             ZPL_VERSION_3
 #define        ZPL_VERSION_USERSPACE           ZPL_VERSION_4
+#define        ZPL_VERSION_SA                  ZPL_VERSION_5
+
+/* Rewind request information */
+#define        ZPOOL_NO_REWIND         1  /* No policy - default behavior */
+#define        ZPOOL_NEVER_REWIND      2  /* Do not search for best txg or rewind */
+#define        ZPOOL_TRY_REWIND        4  /* Search for best txg, but do not rewind */
+#define        ZPOOL_DO_REWIND         8  /* Rewind to best txg w/in deferred frees */
+#define        ZPOOL_EXTREME_REWIND    16 /* Allow extreme measures to find best txg */
+#define        ZPOOL_REWIND_MASK       28 /* All the possible rewind bits */
+#define        ZPOOL_REWIND_POLICIES   31 /* All the possible policy bits */
+
+typedef struct zpool_rewind_policy {
+       uint32_t        zrp_request;    /* rewind behavior requested */
+       uint64_t        zrp_maxmeta;    /* max acceptable meta-data errors */
+       uint64_t        zrp_maxdata;    /* max acceptable data errors */
+       uint64_t        zrp_txg;        /* specific txg to load */
+} zpool_rewind_policy_t;
 
 /*
  * The following are configuration names used in the nvlist describing a pool's
@@ -380,7 +455,8 @@ typedef enum zfs_cache_type {
 #define        ZPOOL_CONFIG_ASHIFT             "ashift"
 #define        ZPOOL_CONFIG_ASIZE              "asize"
 #define        ZPOOL_CONFIG_DTL                "DTL"
-#define        ZPOOL_CONFIG_STATS              "stats"
+#define        ZPOOL_CONFIG_SCAN_STATS         "scan_stats"    /* not stored on disk */
+#define        ZPOOL_CONFIG_VDEV_STATS         "vdev_stats"    /* not stored on disk */
 #define        ZPOOL_CONFIG_WHOLE_DISK         "whole_disk"
 #define        ZPOOL_CONFIG_ERRCOUNT           "error_count"
 #define        ZPOOL_CONFIG_NOT_PRESENT        "not_present"
@@ -393,6 +469,17 @@ typedef enum zfs_cache_type {
 #define        ZPOOL_CONFIG_PHYS_PATH          "phys_path"
 #define        ZPOOL_CONFIG_IS_LOG             "is_log"
 #define        ZPOOL_CONFIG_L2CACHE            "l2cache"
+#define        ZPOOL_CONFIG_HOLE_ARRAY         "hole_array"
+#define        ZPOOL_CONFIG_VDEV_CHILDREN      "vdev_children"
+#define        ZPOOL_CONFIG_IS_HOLE            "is_hole"
+#define        ZPOOL_CONFIG_DDT_HISTOGRAM      "ddt_histogram"
+#define        ZPOOL_CONFIG_DDT_OBJ_STATS      "ddt_object_stats"
+#define        ZPOOL_CONFIG_DDT_STATS          "ddt_stats"
+#define        ZPOOL_CONFIG_SPLIT              "splitcfg"
+#define        ZPOOL_CONFIG_ORIG_GUID          "orig_guid"
+#define        ZPOOL_CONFIG_SPLIT_GUID         "split_guid"
+#define        ZPOOL_CONFIG_SPLIT_LIST         "guid_list"
+#define        ZPOOL_CONFIG_REMOVING           "removing"
 #define        ZPOOL_CONFIG_SUSPENDED          "suspended"     /* not stored on disk */
 #define        ZPOOL_CONFIG_TIMESTAMP          "timestamp"     /* not stored on disk */
 #define        ZPOOL_CONFIG_BOOTFS             "bootfs"        /* not stored on disk */
@@ -406,6 +493,19 @@ typedef enum zfs_cache_type {
 #define        ZPOOL_CONFIG_DEGRADED           "degraded"
 #define        ZPOOL_CONFIG_REMOVED            "removed"
 #define        ZPOOL_CONFIG_FRU                "fru"
+#define        ZPOOL_CONFIG_AUX_STATE          "aux_state"
+
+/* Rewind policy parameters */
+#define        ZPOOL_REWIND_POLICY             "rewind-policy"
+#define        ZPOOL_REWIND_REQUEST            "rewind-request"
+#define        ZPOOL_REWIND_REQUEST_TXG        "rewind-request-txg"
+#define        ZPOOL_REWIND_META_THRESH        "rewind-meta-thresh"
+#define        ZPOOL_REWIND_DATA_THRESH        "rewind-data-thresh"
+
+/* Rewind data discovered */
+#define        ZPOOL_CONFIG_LOAD_TIME          "rewind_txg_ts"
+#define        ZPOOL_CONFIG_LOAD_DATA_ERRORS   "verify_data_errors"
+#define        ZPOOL_CONFIG_REWIND_TIME        "seconds_of_rewind"
 
 #define        VDEV_TYPE_ROOT                  "root"
 #define        VDEV_TYPE_MIRROR                "mirror"
@@ -414,6 +514,7 @@ typedef enum zfs_cache_type {
 #define        VDEV_TYPE_DISK                  "disk"
 #define        VDEV_TYPE_FILE                  "file"
 #define        VDEV_TYPE_MISSING               "missing"
+#define        VDEV_TYPE_HOLE                  "hole"
 #define        VDEV_TYPE_SPARE                 "spare"
 #define        VDEV_TYPE_LOG                   "log"
 #define        VDEV_TYPE_L2CACHE               "l2cache"
@@ -463,7 +564,9 @@ typedef enum vdev_aux {
        VDEV_AUX_SPARED,        /* hot spare used in another pool       */
        VDEV_AUX_ERR_EXCEEDED,  /* too many errors                      */
        VDEV_AUX_IO_FAILURE,    /* experienced I/O failure              */
-       VDEV_AUX_BAD_LOG        /* cannot read log chain(s)             */
+       VDEV_AUX_BAD_LOG,       /* cannot read log chain(s)             */
+       VDEV_AUX_EXTERNAL,      /* external diagnosis                   */
+       VDEV_AUX_SPLIT_POOL     /* vdev was split off into another pool */
 } vdev_aux_t;
 
 /*
@@ -484,14 +587,14 @@ typedef enum pool_state {
 } pool_state_t;
 
 /*
- * Scrub types.
+ * Scan Functions.
  */
-typedef enum pool_scrub_type {
-       POOL_SCRUB_NONE,
-       POOL_SCRUB_RESILVER,
-       POOL_SCRUB_EVERYTHING,
-       POOL_SCRUB_TYPES
-} pool_scrub_type_t;
+typedef enum pool_scan_func {
+       POOL_SCAN_NONE,
+       POOL_SCAN_SCRUB,
+       POOL_SCAN_RESILVER,
+       POOL_SCAN_FUNCS
+} pool_scan_func_t;
 
 /*
  * ZIO types.  Needed to interpret vdev statistics below.
@@ -507,6 +610,36 @@ typedef enum zio_type {
 } zio_type_t;
 
 /*
+ * Pool statistics.  Note: all fields should be 64-bit because this
+ * is passed between kernel and userland as an nvlist uint64 array.
+ */
+typedef struct pool_scan_stat {
+       /* values stored on disk */
+       uint64_t        pss_func;       /* pool_scan_func_t */
+       uint64_t        pss_state;      /* dsl_scan_state_t */
+       uint64_t        pss_start_time; /* scan start time */
+       uint64_t        pss_end_time;   /* scan end time */
+       uint64_t        pss_to_examine; /* total bytes to scan */
+       uint64_t        pss_examined;   /* total examined bytes */
+       uint64_t        pss_to_process; /* total bytes to process */
+       uint64_t        pss_processed;  /* total processed bytes */
+       uint64_t        pss_errors;     /* scan errors  */
+
+       /* values not stored on disk */
+       uint64_t        pss_pass_exam;  /* examined bytes per scan pass */
+       uint64_t        pss_pass_start; /* start time of a scan pass */
+} pool_scan_stat_t;
+
+typedef enum dsl_scan_state {
+       DSS_NONE,
+       DSS_SCANNING,
+       DSS_FINISHED,
+       DSS_CANCELED,
+       DSS_NUM_STATES
+} dsl_scan_state_t;
+
+
+/*
  * Vdev statistics.  Note: all fields should be 64-bit because this
  * is passed between kernel and userland as an nvlist uint64 array.
  */
@@ -524,34 +657,49 @@ typedef struct vdev_stat {
        uint64_t        vs_write_errors;        /* write errors         */
        uint64_t        vs_checksum_errors;     /* checksum errors      */
        uint64_t        vs_self_healed;         /* self-healed bytes    */
-       uint64_t        vs_scrub_type;          /* pool_scrub_type_t    */
-       uint64_t        vs_scrub_complete;      /* completed?           */
-       uint64_t        vs_scrub_examined;      /* bytes examined; top  */
-       uint64_t        vs_scrub_repaired;      /* bytes repaired; leaf */
-       uint64_t        vs_scrub_errors;        /* errors during scrub  */
-       uint64_t        vs_scrub_start;         /* UTC scrub start time */
-       uint64_t        vs_scrub_end;           /* UTC scrub end time   */
+       uint64_t        vs_scan_removing;       /* removing?    */
+       uint64_t        vs_scan_processed;      /* scan processed bytes */
 } vdev_stat_t;
 
+/*
+ * DDT statistics.  Note: all fields should be 64-bit because this
+ * is passed between kernel and userland as an nvlist uint64 array.
+ */
+typedef struct ddt_object {
+       uint64_t        ddo_count;      /* number of elements in ddt    */
+       uint64_t        ddo_dspace;     /* size of ddt on disk          */
+       uint64_t        ddo_mspace;     /* size of ddt in-core          */
+} ddt_object_t;
+
+typedef struct ddt_stat {
+       uint64_t        dds_blocks;     /* blocks                       */
+       uint64_t        dds_lsize;      /* logical size                 */
+       uint64_t        dds_psize;      /* physical size                */
+       uint64_t        dds_dsize;      /* deflated allocated size      */
+       uint64_t        dds_ref_blocks; /* referenced blocks            */
+       uint64_t        dds_ref_lsize;  /* referenced lsize * refcnt    */
+       uint64_t        dds_ref_psize;  /* referenced psize * refcnt    */
+       uint64_t        dds_ref_dsize;  /* referenced dsize * refcnt    */
+} ddt_stat_t;
+
+typedef struct ddt_histogram {
+       ddt_stat_t      ddh_stat[64];   /* power-of-two histogram buckets */
+} ddt_histogram_t;
+
 #define        ZVOL_DRIVER     "zvol"
 #define        ZFS_DRIVER      "zfs"
 #define        ZFS_DEV         "/dev/zfs"
 
-/*
- * zvol paths.  Irritatingly, the devfsadm interfaces want all these
- * paths without the /dev prefix, but for some things, we want the
- * /dev prefix.  Below are the names without /dev.
- */
-#define        ZVOL_DEV_DIR    "zvol/dsk"
-#define        ZVOL_RDEV_DIR   "zvol/rdsk"
-
-/*
- * And here are the things we need with /dev, etc. in front of them.
- */
+/* general zvol path */
+#define        ZVOL_DIR                "/dev/zvol"
+/* expansion */
 #define        ZVOL_PSEUDO_DEV         "/devices/pseudo/zfs@0:"
-#define        ZVOL_FULL_DEV_DIR       "/dev/" ZVOL_DEV_DIR "/"
+/* for dump and swap */
+#define        ZVOL_FULL_DEV_DIR       ZVOL_DIR "/dsk/"
+#define        ZVOL_FULL_RDEV_DIR      ZVOL_DIR "/rdsk/"
 
 #define        ZVOL_PROP_NAME          "name"
+#define        ZVOL_DEFAULT_BLOCKSIZE  8192
 
 /*
  * /dev/zfs ioctl numbers.
@@ -566,7 +714,7 @@ typedef enum zfs_ioc {
        ZFS_IOC_POOL_CONFIGS,
        ZFS_IOC_POOL_STATS,
        ZFS_IOC_POOL_TRYIMPORT,
-       ZFS_IOC_POOL_SCRUB,
+       ZFS_IOC_POOL_SCAN,
        ZFS_IOC_POOL_FREEZE,
        ZFS_IOC_POOL_UPGRADE,
        ZFS_IOC_POOL_GET_HISTORY,
@@ -582,8 +730,6 @@ typedef enum zfs_ioc {
        ZFS_IOC_DATASET_LIST_NEXT,
        ZFS_IOC_SNAPSHOT_LIST_NEXT,
        ZFS_IOC_SET_PROP,
-       ZFS_IOC_CREATE_MINOR,
-       ZFS_IOC_REMOVE_MINOR,
        ZFS_IOC_CREATE,
        ZFS_IOC_DESTROY,
        ZFS_IOC_ROLLBACK,
@@ -604,7 +750,6 @@ typedef enum zfs_ioc {
        ZFS_IOC_POOL_GET_PROPS,
        ZFS_IOC_SET_FSACL,
        ZFS_IOC_GET_FSACL,
-       ZFS_IOC_ISCSI_PERM_CHECK,
        ZFS_IOC_SHARE,
        ZFS_IOC_INHERIT_PROP,
        ZFS_IOC_SMB_ACL,
@@ -613,17 +758,21 @@ typedef enum zfs_ioc {
        ZFS_IOC_USERSPACE_UPGRADE,
        ZFS_IOC_HOLD,
        ZFS_IOC_RELEASE,
-       ZFS_IOC_GET_HOLDS
+       ZFS_IOC_GET_HOLDS,
+       ZFS_IOC_OBJSET_RECVD_PROPS,
+       ZFS_IOC_VDEV_SPLIT
 } zfs_ioc_t;
 
 /*
  * Internal SPA load state.  Used by FMA diagnosis engine.
  */
 typedef enum {
-       SPA_LOAD_NONE,          /* no load in progress */
-       SPA_LOAD_OPEN,          /* normal open */
-       SPA_LOAD_IMPORT,        /* import in progress */
-       SPA_LOAD_TRYIMPORT      /* tryimport in progress */
+       SPA_LOAD_NONE,          /* no load in progress  */
+       SPA_LOAD_OPEN,          /* normal open          */
+       SPA_LOAD_IMPORT,        /* import in progress   */
+       SPA_LOAD_TRYIMPORT,     /* tryimport in progress */
+       SPA_LOAD_RECOVER,       /* recovery requested   */
+       SPA_LOAD_ERROR          /* load failed          */
 } spa_load_state_t;
 
 /*
@@ -686,7 +835,7 @@ typedef enum {
 /*
  * Note: This is encoded on-disk, so new events must be added to the
  * end, and unused events can not be removed.  Be sure to edit
- * zpool_main.c: hist_event_table[].
+ * libzfs_pool.c: hist_event_table[].
  */
 typedef enum history_internal_events {
        LOG_NO_EVENT = 0,
@@ -703,7 +852,7 @@ typedef enum history_internal_events {
        LOG_POOL_VDEV_OFFLINE,
        LOG_POOL_UPGRADE,
        LOG_POOL_CLEAR,
-       LOG_POOL_SCRUB,
+       LOG_POOL_SCAN,
        LOG_POOL_PROPSET,
        LOG_DS_CREATE,
        LOG_DS_CLONE,
@@ -726,9 +875,10 @@ typedef enum history_internal_events {
        LOG_DS_UPGRADE,
        LOG_DS_REFQUOTA,
        LOG_DS_REFRESERV,
-       LOG_POOL_SCRUB_DONE,
+       LOG_POOL_SCAN_DONE,
        LOG_DS_USER_HOLD,
        LOG_DS_USER_RELEASE,
+       LOG_POOL_SPLIT,
        LOG_END
 } history_internal_events_t;
 
index f517044..61327f9 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _ZFS_COMUTIL_H
 #define        _ZFS_COMUTIL_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/fs/zfs.h>
 #include <sys/types.h>
 
 extern "C" {
 #endif
 
-extern boolean_t zfs_allocatable_devs(nvlist_t *nv);
+extern boolean_t zfs_allocatable_devs(nvlist_t *);
+extern void zpool_get_rewind_policy(nvlist_t *, zpool_rewind_policy_t *);
+
+extern int zfs_zpl_version_map(int spa_version);
+extern int zfs_spa_version_map(int zpl_version);
+extern const char *zfs_history_event_names[LOG_END];
 
 #ifdef __cplusplus
 }
diff --git a/module/zcommon/include/zfs_fletcher.h b/module/zcommon/include/zfs_fletcher.h
new file mode 100644 (file)
index 0000000..b49df0c
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef        _ZFS_FLETCHER_H
+#define        _ZFS_FLETCHER_H
+
+#include <sys/types.h>
+#include <sys/spa.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * fletcher checksum functions
+ */
+
+void fletcher_2_native(const void *, uint64_t, zio_cksum_t *);
+void fletcher_2_byteswap(const void *, uint64_t, zio_cksum_t *);
+void fletcher_4_native(const void *, uint64_t, zio_cksum_t *);
+void fletcher_4_byteswap(const void *, uint64_t, zio_cksum_t *);
+void fletcher_4_incremental_native(const void *, uint64_t,
+    zio_cksum_t *);
+void fletcher_4_incremental_byteswap(const void *, uint64_t,
+    zio_cksum_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_FLETCHER_H */
index da5ae43..a632623 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef        _ZFS_PROP_H
 #define        _ZFS_PROP_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/fs/zfs.h>
 #include <sys/types.h>
 
@@ -79,6 +77,7 @@ typedef struct {
                                        /* "zfs get" help message */
        const zprop_index_t *pd_table;  /* for index properties, a table */
                                        /* defining the possible values */
+       size_t pd_table_size;           /* number of entries in pd_table[] */
 } zprop_desc_t;
 
 /*
@@ -99,16 +98,16 @@ zprop_desc_t *zpool_prop_get_table(void);
 /*
  * Common routines to initialize property tables
  */
-void register_impl(int, const char *, zprop_type_t, uint64_t,
+void zprop_register_impl(int, const char *, zprop_type_t, uint64_t,
     const char *, zprop_attr_t, int, const char *, const char *,
     boolean_t, boolean_t, const zprop_index_t *);
-void register_string(int, const char *, const char *, zprop_attr_t attr,
-    int, const char *, const char *);
-void register_number(int, const char *, uint64_t, zprop_attr_t, int,
+void zprop_register_string(int, const char *, const char *,
+    zprop_attr_t attr, int, const char *, const char *);
+void zprop_register_number(int, const char *, uint64_t, zprop_attr_t, int,
     const char *, const char *);
-void register_index(int, const char *, uint64_t, zprop_attr_t, int,
+void zprop_register_index(int, const char *, uint64_t, zprop_attr_t, int,
     const char *, const char *, const zprop_index_t *);
-void register_hidden(int, const char *, zprop_type_t, zprop_attr_t,
+void zprop_register_hidden(int, const char *, zprop_type_t, zprop_attr_t,
     int, const char *);
 
 /*
@@ -118,6 +117,7 @@ int zprop_iter_common(zprop_func, void *, boolean_t, boolean_t, zfs_type_t);
 int zprop_name_to_prop(const char *, zfs_type_t);
 int zprop_string_to_index(int, const char *, uint64_t *, zfs_type_t);
 int zprop_index_to_string(int, uint64_t, const char **, zfs_type_t);
+uint64_t zprop_random_value(int, uint64_t, zfs_type_t);
 const char *zprop_values(int, zfs_type_t);
 size_t zprop_width(int, boolean_t *, zfs_type_t);
 boolean_t zprop_valid_for_type(int, zfs_type_t);
index 74517a3..ed9b67e 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 /*
  * This file is intended for functions that ought to be common between user
  * land (libzfs) and the kernel. When many common routines need to be shared
 
 #if defined(_KERNEL)
 #include <sys/systm.h>
+#else
+#include <string.h>
 #endif
 
 #include <sys/types.h>
 #include <sys/fs/zfs.h>
+#include <sys/int_limits.h>
 #include <sys/nvpair.h>
+#include "zfs_comutil.h"
 
 /*
  * Are there allocatable vdevs?
@@ -63,3 +64,139 @@ zfs_allocatable_devs(nvlist_t *nv)
        }
        return (B_FALSE);
 }
+
+void
+zpool_get_rewind_policy(nvlist_t *nvl, zpool_rewind_policy_t *zrpp)
+{
+       nvlist_t *policy;
+       nvpair_t *elem;
+       char *nm;
+
+       /* Defaults */
+       zrpp->zrp_request = ZPOOL_NO_REWIND;
+       zrpp->zrp_maxmeta = 0;
+       zrpp->zrp_maxdata = UINT64_MAX;
+       zrpp->zrp_txg = UINT64_MAX;
+
+       if (nvl == NULL)
+               return;
+
+       elem = NULL;
+       while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
+               nm = nvpair_name(elem);
+               if (strcmp(nm, ZPOOL_REWIND_POLICY) == 0) {
+                       if (nvpair_value_nvlist(elem, &policy) == 0)
+                               zpool_get_rewind_policy(policy, zrpp);
+                       return;
+               } else if (strcmp(nm, ZPOOL_REWIND_REQUEST) == 0) {
+                       if (nvpair_value_uint32(elem, &zrpp->zrp_request) == 0)
+                               if (zrpp->zrp_request & ~ZPOOL_REWIND_POLICIES)
+                                       zrpp->zrp_request = ZPOOL_NO_REWIND;
+               } else if (strcmp(nm, ZPOOL_REWIND_REQUEST_TXG) == 0) {
+                       (void) nvpair_value_uint64(elem, &zrpp->zrp_txg);
+               } else if (strcmp(nm, ZPOOL_REWIND_META_THRESH) == 0) {
+                       (void) nvpair_value_uint64(elem, &zrpp->zrp_maxmeta);
+               } else if (strcmp(nm, ZPOOL_REWIND_DATA_THRESH) == 0) {
+                       (void) nvpair_value_uint64(elem, &zrpp->zrp_maxdata);
+               }
+       }
+       if (zrpp->zrp_request == 0)
+               zrpp->zrp_request = ZPOOL_NO_REWIND;
+}
+
+typedef struct zfs_version_spa_map {
+       int     version_zpl;
+       int     version_spa;
+} zfs_version_spa_map_t;
+
+/*
+ * Keep this table in monotonically increasing version number order.
+ */
+static zfs_version_spa_map_t zfs_version_table[] = {
+       {ZPL_VERSION_INITIAL, SPA_VERSION_INITIAL},
+       {ZPL_VERSION_DIRENT_TYPE, SPA_VERSION_INITIAL},
+       {ZPL_VERSION_FUID, SPA_VERSION_FUID},
+       {ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE},
+       {ZPL_VERSION_SA, SPA_VERSION_SA},
+       {0, 0}
+};
+
+/*
+ * Return the max zpl version for a corresponding spa version
+ * -1 is returned if no mapping exists.
+ */
+int
+zfs_zpl_version_map(int spa_version)
+{
+       int i;
+       int version = -1;
+
+       for (i = 0; zfs_version_table[i].version_spa; i++) {
+               if (spa_version >= zfs_version_table[i].version_spa)
+                       version = zfs_version_table[i].version_zpl;
+       }
+
+       return (version);
+}
+
+/*
+ * Return the min spa version for a corresponding zpl version
+ * -1 is returned if no mapping exists.
+ */
+int
+zfs_spa_version_map(int zpl_version)
+{
+       int i;
+       int version = -1;
+
+       for (i = 0; zfs_version_table[i].version_zpl; i++) {
+               if (zfs_version_table[i].version_zpl >= zpl_version)
+                       return (zfs_version_table[i].version_spa);
+       }
+
+       return (version);
+}
+
+const char *zfs_history_event_names[LOG_END] = {
+       "invalid event",
+       "pool create",
+       "vdev add",
+       "pool remove",
+       "pool destroy",
+       "pool export",
+       "pool import",
+       "vdev attach",
+       "vdev replace",
+       "vdev detach",
+       "vdev online",
+       "vdev offline",
+       "vdev upgrade",
+       "pool clear",
+       "pool scrub",
+       "pool property set",
+       "create",
+       "clone",
+       "destroy",
+       "destroy_begin_sync",
+       "inherit",
+       "property set",
+       "quota set",
+       "permission update",
+       "permission remove",
+       "permission who remove",
+       "promote",
+       "receive",
+       "rename",
+       "reservation set",
+       "replay_inc_sync",
+       "replay_full_sync",
+       "rollback",
+       "snapshot",
+       "filesystem version upgrade",
+       "refquota set",
+       "refreservation set",
+       "pool scrub done",
+       "user hold",
+       "user release",
+       "pool split",
+};
similarity index 99%
rename from module/zfs/fletcher.c
rename to module/zcommon/zfs_fletcher.c
index 54247d7..fa43ce6 100644 (file)
 #include <sys/types.h>
 #include <sys/sysmacros.h>
 #include <sys/byteorder.h>
+#include <sys/zio.h>
 #include <sys/spa.h>
 
 void
index 6a32846..f29bcf6 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
+/* Portions Copyright 2010 Robert Milkowski */
+
 #include <sys/zio.h>
 #include <sys/spa.h>
 #include <sys/u8_textprep.h>
@@ -69,6 +70,16 @@ zfs_prop_init(void)
                { NULL }
        };
 
+       static zprop_index_t dedup_table[] = {
+               { "on",         ZIO_CHECKSUM_ON },
+               { "off",        ZIO_CHECKSUM_OFF },
+               { "verify",     ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY },
+               { "sha256",     ZIO_CHECKSUM_SHA256 },
+               { "sha256,verify",
+                               ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY },
+               { NULL }
+       };
+
        static zprop_index_t compress_table[] = {
                { "on",         ZIO_COMPRESS_ON },
                { "off",        ZIO_COMPRESS_OFF },
@@ -83,6 +94,7 @@ zfs_prop_init(void)
                { "gzip-7",     ZIO_COMPRESS_GZIP_7 },
                { "gzip-8",     ZIO_COMPRESS_GZIP_8 },
                { "gzip-9",     ZIO_COMPRESS_GZIP_9 },
+               { "zle",        ZIO_COMPRESS_ZLE },
                { NULL }
        };
 
@@ -92,13 +104,6 @@ zfs_prop_init(void)
                { NULL }
        };
 
-       static zprop_index_t acl_mode_table[] = {
-               { "discard",    ZFS_ACL_DISCARD },
-               { "groupmask",  ZFS_ACL_GROUPMASK },
-               { "passthrough", ZFS_ACL_PASSTHROUGH },
-               { NULL }
-       };
-
        static zprop_index_t acl_inherit_table[] = {
                { "discard",    ZFS_ACL_DISCARD },
                { "noallow",    ZFS_ACL_NOALLOW },
@@ -142,6 +147,7 @@ zfs_prop_init(void)
                { "2",          2 },
                { "3",          3 },
                { "4",          4 },
+               { "5",          5 },
                { "current",    ZPL_VERSION },
                { NULL }
        };
@@ -152,6 +158,12 @@ zfs_prop_init(void)
                { NULL }
        };
 
+       static zprop_index_t logbias_table[] = {
+               { "latency",    ZFS_LOGBIAS_LATENCY },
+               { "throughput", ZFS_LOGBIAS_THROUGHPUT },
+               { NULL }
+       };
+
        static zprop_index_t canmount_table[] = {
                { "off",        ZFS_CANMOUNT_OFF },
                { "on",         ZFS_CANMOUNT_ON },
@@ -166,170 +178,208 @@ zfs_prop_init(void)
                { NULL }
        };
 
+       static zprop_index_t sync_table[] = {
+               { "standard",   ZFS_SYNC_STANDARD },
+               { "always",     ZFS_SYNC_ALWAYS },
+               { "disabled",   ZFS_SYNC_DISABLED },
+               { NULL }
+       };
+
        /* inherit index properties */
-       register_index(ZFS_PROP_CHECKSUM, "checksum", ZIO_CHECKSUM_DEFAULT,
+       zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD,
            PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+           "standard | always | disabled", "SYNC",
+           sync_table);
+       zprop_register_index(ZFS_PROP_CHECKSUM, "checksum",
+           ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM |
+           ZFS_TYPE_VOLUME,
            "on | off | fletcher2 | fletcher4 | sha256", "CHECKSUM",
            checksum_table);
-       register_index(ZFS_PROP_COMPRESSION, "compression",
+       zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF,
+           PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+           "on | off | verify | sha256[,verify]", "DEDUP",
+           dedup_table);
+       zprop_register_index(ZFS_PROP_COMPRESSION, "compression",
            ZIO_COMPRESS_DEFAULT, PROP_INHERIT,
            ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
-           "on | off | lzjb | gzip | gzip-[1-9]", "COMPRESS", compress_table);
-       register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN,
+           "on | off | lzjb | gzip | gzip-[1-9] | zle", "COMPRESS",
+           compress_table);
+       zprop_register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN,
            PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
            "hidden | visible", "SNAPDIR", snapdir_table);
-       register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_GROUPMASK,
-           PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
-           "discard | groupmask | passthrough", "ACLMODE", acl_mode_table);
-       register_index(ZFS_PROP_ACLINHERIT, "aclinherit", ZFS_ACL_RESTRICTED,
-           PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+       zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit",
+           ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
            "discard | noallow | restricted | passthrough | passthrough-x",
            "ACLINHERIT", acl_inherit_table);
-       register_index(ZFS_PROP_COPIES, "copies", 1,
-           PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+       zprop_register_index(ZFS_PROP_COPIES, "copies", 1, PROP_INHERIT,
+           ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
            "1 | 2 | 3", "COPIES", copies_table);
-       register_index(ZFS_PROP_PRIMARYCACHE, "primarycache",
+       zprop_register_index(ZFS_PROP_PRIMARYCACHE, "primarycache",
            ZFS_CACHE_ALL, PROP_INHERIT,
            ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
            "all | none | metadata", "PRIMARYCACHE", cache_table);
-       register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache",
+       zprop_register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache",
            ZFS_CACHE_ALL, PROP_INHERIT,
            ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
            "all | none | metadata", "SECONDARYCACHE", cache_table);
+       zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY,
+           PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+           "latency | throughput", "LOGBIAS", logbias_table);
 
        /* inherit index (boolean) properties */
-       register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
+       zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
            ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table);
-       register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT,
+       zprop_register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT,
            ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "DEVICES",
            boolean_table);
-       register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT,
+       zprop_register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT,
            ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "EXEC",
            boolean_table);
-       register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT,
+       zprop_register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT,
            ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID",
            boolean_table);
-       register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT,
+       zprop_register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT,
            ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY",
            boolean_table);
-       register_index(ZFS_PROP_ZONED, "zoned", 0, PROP_INHERIT,
+       zprop_register_index(ZFS_PROP_ZONED, "zoned", 0, PROP_INHERIT,
            ZFS_TYPE_FILESYSTEM, "on | off", "ZONED", boolean_table);
-       register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT,
+       zprop_register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT,
            ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "XATTR",
            boolean_table);
-       register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT,
+       zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT,
            ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN",
            boolean_table);
-       register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT,
+       zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT,
            ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND",
            boolean_table);
 
        /* default index properties */
-       register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT,
+       zprop_register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT,
            ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
            "1 | 2 | 3 | 4 | current", "VERSION", version_table);
-       register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON,
+       zprop_register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON,
            PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto",
            "CANMOUNT", canmount_table);
 
        /* readonly index (boolean) properties */
-       register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY,
+       zprop_register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY,
            ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table);
-       register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0,
+       zprop_register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0,
            PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY",
            boolean_table);
 
        /* set once index properties */
-       register_index(ZFS_PROP_NORMALIZE, "normalization", 0,
+       zprop_register_index(ZFS_PROP_NORMALIZE, "normalization", 0,
            PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
            "none | formC | formD | formKC | formKD", "NORMALIZATION",
            normalize_table);
-       register_index(ZFS_PROP_CASE, "casesensitivity", ZFS_CASE_SENSITIVE,
-           PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+       zprop_register_index(ZFS_PROP_CASE, "casesensitivity",
+           ZFS_CASE_SENSITIVE, PROP_ONETIME, ZFS_TYPE_FILESYSTEM |
+           ZFS_TYPE_SNAPSHOT,
            "sensitive | insensitive | mixed", "CASE", case_table);
 
        /* set once index (boolean) properties */
-       register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME,
+       zprop_register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME,
            ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
            "on | off", "UTF8ONLY", boolean_table);
 
        /* string properties */
-       register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY,
+       zprop_register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY,
            ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<snapshot>", "ORIGIN");
-       register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/", PROP_INHERIT,
-           ZFS_TYPE_FILESYSTEM, "<path> | legacy | none", "MOUNTPOINT");
-       register_string(ZFS_PROP_SHARENFS, "sharenfs", "off", PROP_INHERIT,
-           ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options", "SHARENFS");
-       register_string(ZFS_PROP_SHAREISCSI, "shareiscsi", "off", PROP_INHERIT,
-           ZFS_TYPE_DATASET, "on | off | type=<type>", "SHAREISCSI");
-       register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY,
+       zprop_register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/",
+           PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "<path> | legacy | none",
+           "MOUNTPOINT");
+       zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off",
+           PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options",
+           "SHARENFS");
+       zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY,
            ZFS_TYPE_DATASET, "filesystem | volume | snapshot", "TYPE");
-       register_string(ZFS_PROP_SHARESMB, "sharesmb", "off", PROP_INHERIT,
-           ZFS_TYPE_FILESYSTEM, "on | off | sharemgr(1M) options", "SHARESMB");
+       zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off",
+           PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+           "on | off | sharemgr(1M) options", "SHARESMB");
+       zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel",
+           ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET,
+           "<sensitivity label>", "MLSLABEL");
 
        /* readonly number properties */
-       register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY,
+       zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY,
            ZFS_TYPE_DATASET, "<size>", "USED");
-       register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY,
+       zprop_register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY,
            ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "AVAIL");
-       register_number(ZFS_PROP_REFERENCED, "referenced", 0, PROP_READONLY,
-           ZFS_TYPE_DATASET, "<size>", "REFER");
-       register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0,
+       zprop_register_number(ZFS_PROP_REFERENCED, "referenced", 0,
+           PROP_READONLY, ZFS_TYPE_DATASET, "<size>", "REFER");
+       zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0,
            PROP_READONLY, ZFS_TYPE_DATASET,
            "<1.00x or higher if compressed>", "RATIO");
-       register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", 8192,
-           PROP_ONETIME,
+       zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize",
+           ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME,
            ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK");
-       register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0, PROP_READONLY,
-           ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDSNAP");
-       register_number(ZFS_PROP_USEDDS, "usedbydataset", 0, PROP_READONLY,
-           ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDDS");
-       register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0, PROP_READONLY,
-           ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDCHILD");
-       register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0,
+       zprop_register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0,
+           PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+           "USEDSNAP");
+       zprop_register_number(ZFS_PROP_USEDDS, "usedbydataset", 0,
+           PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+           "USEDDS");
+       zprop_register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0,
+           PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+           "USEDCHILD");
+       zprop_register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0,
            PROP_READONLY,
            ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDREFRESERV");
-       register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY,
+       zprop_register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY,
            ZFS_TYPE_SNAPSHOT, "<count>", "USERREFS");
 
        /* default number properties */
-       register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT,
+       zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT,
            ZFS_TYPE_FILESYSTEM, "<size> | none", "QUOTA");
-       register_number(ZFS_PROP_RESERVATION, "reservation", 0, PROP_DEFAULT,
-           ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size> | none", "RESERV");
-       register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT,
+       zprop_register_number(ZFS_PROP_RESERVATION, "reservation", 0,
+           PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+           "<size> | none", "RESERV");
+       zprop_register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT,
            ZFS_TYPE_VOLUME, "<size>", "VOLSIZE");
-       register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT,
+       zprop_register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT,
            ZFS_TYPE_FILESYSTEM, "<size> | none", "REFQUOTA");
-       register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0,
+       zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0,
            PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
            "<size> | none", "REFRESERV");
 
        /* inherit number properties */
-       register_number(ZFS_PROP_RECORDSIZE, "recordsize", SPA_MAXBLOCKSIZE,
-           PROP_INHERIT,
+       zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
+           SPA_MAXBLOCKSIZE, PROP_INHERIT,
            ZFS_TYPE_FILESYSTEM, "512 to 128k, power of 2", "RECSIZE");
 
        /* hidden properties */
-       register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,
-           PROP_READONLY, ZFS_TYPE_DATASET, NULL);
-       register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
-           PROP_READONLY, ZFS_TYPE_SNAPSHOT, NULL);
-       register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING,
+       zprop_register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,
+           PROP_READONLY, ZFS_TYPE_DATASET, "CREATETXG");
+       zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
+           PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES");
+       zprop_register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING,
            PROP_READONLY, ZFS_TYPE_DATASET, "NAME");
-       register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", PROP_TYPE_STRING,
-           PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS");
-       register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu",
+       zprop_register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions",
+           PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS");
+       zprop_register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu",
            PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME,
            "STMF_SBD_LU");
-       register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER, PROP_READONLY,
-           ZFS_TYPE_DATASET, "GUID");
-       register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting",
-           PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, NULL);
+       zprop_register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER,
+           PROP_READONLY, ZFS_TYPE_DATASET, "GUID");
+       zprop_register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting",
+           PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET,
+           "USERACCOUNTING");
+       zprop_register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER,
+           PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE");
+       zprop_register_hidden(ZFS_PROP_OBJSETID, "objsetid", PROP_TYPE_NUMBER,
+           PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID");
+
+       /*
+        * Property to be removed once libbe is integrated
+        */
+       zprop_register_hidden(ZFS_PROP_PRIVATE, "priv_prop",
+           PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_FILESYSTEM,
+           "PRIV_PROP");
 
        /* oddball properties */
-       register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, NULL,
-           PROP_READONLY, ZFS_TYPE_DATASET,
+       zprop_register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0,
+           NULL, PROP_READONLY, ZFS_TYPE_DATASET,
            "<date>", "CREATION", B_FALSE, B_TRUE, NULL);
 }
 
@@ -337,6 +387,11 @@ boolean_t
 zfs_prop_delegatable(zfs_prop_t prop)
 {
        zprop_desc_t *pd = &zfs_prop_table[prop];
+
+       /* The mlslabel property is never delegatable. */
+       if (prop == ZFS_PROP_MLSLABEL)
+               return (B_FALSE);
+
        return (pd->pd_attr != PROP_READONLY);
 }
 
@@ -421,6 +476,12 @@ zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string)
        return (zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET));
 }
 
+uint64_t
+zfs_prop_random_value(zfs_prop_t prop, uint64_t seed)
+{
+       return (zprop_random_value(prop, seed, ZFS_TYPE_DATASET));
+}
+
 /*
  * Returns TRUE if the property applies to any of the given dataset types.
  */
index d57dcfb..0b8a952 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -64,48 +64,55 @@ zpool_prop_init(void)
        };
 
        /* string properties */
-       register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT,
+       zprop_register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT,
            ZFS_TYPE_POOL, "<path>", "ALTROOT");
-       register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT,
+       zprop_register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT,
            ZFS_TYPE_POOL, "<filesystem>", "BOOTFS");
-       register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL, PROP_DEFAULT,
-           ZFS_TYPE_POOL, "<file> | none", "CACHEFILE");
+       zprop_register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL,
+           PROP_DEFAULT, ZFS_TYPE_POOL, "<file> | none", "CACHEFILE");
 
        /* readonly number properties */
-       register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY,
+       zprop_register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY,
            ZFS_TYPE_POOL, "<size>", "SIZE");
-       register_number(ZPOOL_PROP_USED, "used", 0, PROP_READONLY,
-           ZFS_TYPE_POOL, "<size>", "USED");
-       register_number(ZPOOL_PROP_AVAILABLE, "available", 0, PROP_READONLY,
-           ZFS_TYPE_POOL, "<size>", "AVAIL");
-       register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY,
+       zprop_register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY,
+           ZFS_TYPE_POOL, "<size>", "FREE");
+       zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0,
+           PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC");
+       zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY,
            ZFS_TYPE_POOL, "<size>", "CAP");
-       register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY,
+       zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY,
            ZFS_TYPE_POOL, "<guid>", "GUID");
-       register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY,
+       zprop_register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY,
            ZFS_TYPE_POOL, "<state>", "HEALTH");
+       zprop_register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0,
+           PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if deduped>",
+           "DEDUP");
 
        /* default number properties */
-       register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
+       zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
            PROP_DEFAULT, ZFS_TYPE_POOL, "<version>", "VERSION");
+       zprop_register_number(ZPOOL_PROP_DEDUPDITTO, "dedupditto", 0,
+           PROP_DEFAULT, ZFS_TYPE_POOL, "<threshold (min 100)>", "DEDUPDITTO");
 
        /* default index (boolean) properties */
-       register_index(ZPOOL_PROP_DELEGATION, "delegation", 1, PROP_DEFAULT,
-           ZFS_TYPE_POOL, "on | off", "DELEGATION", boolean_table);
-       register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0, PROP_DEFAULT,
-           ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table);
-       register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0, PROP_DEFAULT,
-           ZFS_TYPE_POOL, "on | off", "LISTSNAPS", boolean_table);
-       register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0, PROP_DEFAULT,
-           ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table);
+       zprop_register_index(ZPOOL_PROP_DELEGATION, "delegation", 1,
+           PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "DELEGATION",
+           boolean_table);
+       zprop_register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0,
+           PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table);
+       zprop_register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0,
+           PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "LISTSNAPS",
+           boolean_table);
+       zprop_register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0,
+           PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table);
 
        /* default index properties */
-       register_index(ZPOOL_PROP_FAILUREMODE, "failmode",
+       zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode",
            ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL,
            "wait | continue | panic", "FAILMODE", failuremode_table);
 
        /* hidden properties */
-       register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
+       zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
            PROP_READONLY, ZFS_TYPE_POOL, "NAME");
 }
 
@@ -166,6 +173,12 @@ zpool_prop_index_to_string(zpool_prop_t prop, uint64_t index,
        return (zprop_index_to_string(prop, index, string, ZFS_TYPE_POOL));
 }
 
+uint64_t
+zpool_prop_random_value(zpool_prop_t prop, uint64_t seed)
+{
+       return (zprop_random_value(prop, seed, ZFS_TYPE_POOL));
+}
+
 #ifndef _KERNEL
 
 const char *
index 85f55c2..0bbf20d 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -65,7 +65,7 @@ zprop_get_numprops(zfs_type_t type)
 }
 
 void
-register_impl(int prop, const char *name, zprop_type_t type,
+zprop_register_impl(int prop, const char *name, zprop_type_t type,
     uint64_t numdefault, const char *strdefault, zprop_attr_t attr,
     int objset_types, const char *values, const char *colname,
     boolean_t rightalign, boolean_t visible, const zprop_index_t *idx_tbl)
@@ -76,6 +76,8 @@ register_impl(int prop, const char *name, zprop_type_t type,
        pd = &prop_tbl[prop];
 
        ASSERT(pd->pd_name == NULL || pd->pd_name == name);
+       ASSERT(name != NULL);
+       ASSERT(colname != NULL);
 
        pd->pd_name = name;
        pd->pd_propnum = prop;
@@ -89,40 +91,44 @@ register_impl(int prop, const char *name, zprop_type_t type,
        pd->pd_rightalign = rightalign;
        pd->pd_visible = visible;
        pd->pd_table = idx_tbl;
+       pd->pd_table_size = 0;
+       while (idx_tbl && (idx_tbl++)->pi_name != NULL)
+               pd->pd_table_size++;
 }
 
 void
-register_string(int prop, const char *name, const char *def,
+zprop_register_string(int prop, const char *name, const char *def,
     zprop_attr_t attr, int objset_types, const char *values,
     const char *colname)
 {
-       register_impl(prop, name, PROP_TYPE_STRING, 0, def, attr,
+       zprop_register_impl(prop, name, PROP_TYPE_STRING, 0, def, attr,
            objset_types, values, colname, B_FALSE, B_TRUE, NULL);
 
 }
 
 void
-register_number(int prop, const char *name, uint64_t def, zprop_attr_t attr,
-    int objset_types, const char *values, const char *colname)
+zprop_register_number(int prop, const char *name, uint64_t def,
+    zprop_attr_t attr, int objset_types, const char *values,
+    const char *colname)
 {
-       register_impl(prop, name, PROP_TYPE_NUMBER, def, NULL, attr,
+       zprop_register_impl(prop, name, PROP_TYPE_NUMBER, def, NULL, attr,
            objset_types, values, colname, B_TRUE, B_TRUE, NULL);
 }
 
 void
-register_index(int prop, const char *name, uint64_t def, zprop_attr_t attr,
-    int objset_types, const char *values, const char *colname,
-    const zprop_index_t *idx_tbl)
+zprop_register_index(int prop, const char *name, uint64_t def,
+    zprop_attr_t attr, int objset_types, const char *values,
+    const char *colname, const zprop_index_t *idx_tbl)
 {
-       register_impl(prop, name, PROP_TYPE_INDEX, def, NULL, attr,
+       zprop_register_impl(prop, name, PROP_TYPE_INDEX, def, NULL, attr,
            objset_types, values, colname, B_TRUE, B_TRUE, idx_tbl);
 }
 
 void
-register_hidden(int prop, const char *name, zprop_type_t type,
+zprop_register_hidden(int prop, const char *name, zprop_type_t type,
     zprop_attr_t attr, int objset_types, const char *colname)
 {
-       register_impl(prop, name, type, 0, NULL, attr,
+       zprop_register_impl(prop, name, type, 0, NULL, attr,
            objset_types, NULL, colname, B_FALSE, B_FALSE, NULL);
 }
 
@@ -307,6 +313,25 @@ zprop_index_to_string(int prop, uint64_t index, const char **string,
        return (-1);
 }
 
+/*
+ * Return a random valid property value.  Used by ztest.
+ */
+uint64_t
+zprop_random_value(int prop, uint64_t seed, zfs_type_t type)
+{
+       zprop_desc_t *prop_tbl;
+       const zprop_index_t *idx_tbl;
+
+       ASSERT((uint_t)prop < zprop_get_numprops(type));
+       prop_tbl = zprop_get_proptable(type);
+       idx_tbl = prop_tbl[prop].pd_table;
+
+       if (idx_tbl == NULL)
+               return (seed);
+
+       return (idx_tbl[seed % prop_tbl[prop].pd_table_size].pi_value);
+}
+
 const char *
 zprop_values(int prop, zfs_type_t type)
 {
index d5e5aa5..8adb54d 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
 
 #include <sys/spa.h>
 #include <sys/zio.h>
-#include <sys/zio_checksum.h>
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
 #include <sys/refcount.h>
 #endif
 #include <sys/callb.h>
 #include <sys/kstat.h>
+#include <zfs_fletcher.h>
 
 static kmutex_t                arc_reclaim_thr_lock;
 static kcondvar_t      arc_reclaim_thr_cv;     /* used to signal reclaim thr */
@@ -178,7 +177,6 @@ static boolean_t arc_warm;
 uint64_t zfs_arc_max;
 uint64_t zfs_arc_min;
 uint64_t zfs_arc_meta_limit = 0;
-int zfs_mdcomp_disable = 0;
 int zfs_arc_grow_retry = 0;
 int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
@@ -249,6 +247,9 @@ typedef struct arc_stats {
        kstat_named_t arcstat_recycle_miss;
        kstat_named_t arcstat_mutex_miss;
        kstat_named_t arcstat_evict_skip;
+       kstat_named_t arcstat_evict_l2_cached;
+       kstat_named_t arcstat_evict_l2_eligible;
+       kstat_named_t arcstat_evict_l2_ineligible;
        kstat_named_t arcstat_hash_elements;
        kstat_named_t arcstat_hash_elements_max;
        kstat_named_t arcstat_hash_collisions;
@@ -302,6 +303,9 @@ static arc_stats_t arc_stats = {
        { "recycle_miss",               KSTAT_DATA_UINT64 },
        { "mutex_miss",                 KSTAT_DATA_UINT64 },
        { "evict_skip",                 KSTAT_DATA_UINT64 },
+       { "evict_l2_cached",            KSTAT_DATA_UINT64 },
+       { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
+       { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
        { "hash_elements",              KSTAT_DATA_UINT64 },
        { "hash_elements_max",          KSTAT_DATA_UINT64 },
        { "hash_collisions",            KSTAT_DATA_UINT64 },
@@ -341,7 +345,7 @@ static arc_stats_t arc_stats = {
 #define        ARCSTAT_INCR(stat, val) \
        atomic_add_64(&arc_stats.stat.value.ui64, (val));
 
-#define        ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
+#define        ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
 #define        ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
 
 #define        ARCSTAT_MAX(stat, val) {                                        \
@@ -375,7 +379,7 @@ static arc_stats_t arc_stats = {
        }
 
 kstat_t                        *arc_ksp;
-static arc_state_t     *arc_anon;
+static arc_state_t     *arc_anon;
 static arc_state_t     *arc_mru;
 static arc_state_t     *arc_mru_ghost;
 static arc_state_t     *arc_mfu;
@@ -432,6 +436,7 @@ struct arc_buf_hdr {
 
        kmutex_t                b_freeze_lock;
        zio_cksum_t             *b_freeze_cksum;
+       void                    *b_thawed;
 
        arc_buf_hdr_t           *b_hash_next;
        arc_buf_t               *b_buf;
@@ -468,6 +473,8 @@ static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
 static int arc_evict_needed(arc_buf_contents_t type);
 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
 
+static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
+
 #define        GHOST_STATE(state)      \
        ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
        (state) == arc_l2c_only)
@@ -490,7 +497,6 @@ static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
 #define        ARC_L2_WRITING          (1 << 16)       /* L2ARC write in progress */
 #define        ARC_L2_EVICTED          (1 << 17)       /* evicted during I/O */
 #define        ARC_L2_WRITE_HEAD       (1 << 18)       /* head of write list */
-#define        ARC_STORED              (1 << 19)       /* has been store()d to */
 
 #define        HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_IN_HASH_TABLE)
 #define        HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
@@ -539,8 +545,8 @@ static buf_hash_table_t buf_hash_table;
        (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 #define        BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 #define        BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
-#define        HDR_LOCK(buf) \
-       (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
+#define        HDR_LOCK(hdr) \
+       (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 
 uint64_t zfs_crc64_table[256];
 
@@ -658,6 +664,15 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
        ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&     \
        ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 
+static void
+buf_discard_identity(arc_buf_hdr_t *hdr)
+{
+       hdr->b_dva.dva_word[0] = 0;
+       hdr->b_dva.dva_word[1] = 0;
+       hdr->b_birth = 0;
+       hdr->b_cksum0 = 0;
+}
+
 static arc_buf_hdr_t *
 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
 {
@@ -791,7 +806,8 @@ buf_cons(void *vbuf, void *unused, int kmflag)
        arc_buf_t *buf = vbuf;
 
        bzero(buf, sizeof (arc_buf_t));
-       rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
+       mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+       rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL);
        arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 
        return (0);
@@ -807,6 +823,7 @@ hdr_dest(void *vbuf, void *unused)
 {
        arc_buf_hdr_t *buf = vbuf;
 
+       ASSERT(BUF_EMPTY(buf));
        refcount_destroy(&buf->b_refcnt);
        cv_destroy(&buf->b_cv);
        mutex_destroy(&buf->b_freeze_lock);
@@ -819,7 +836,8 @@ buf_dest(void *vbuf, void *unused)
 {
        arc_buf_t *buf = vbuf;
 
-       rw_destroy(&buf->b_lock);
+       mutex_destroy(&buf->b_evict_lock);
+       rw_destroy(&buf->b_data_lock);
        arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 }
 
@@ -934,6 +952,11 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force)
 void
 arc_buf_thaw(arc_buf_t *buf)
 {
+       kmutex_t *hash_lock;
+
+       hash_lock = HDR_LOCK(buf->b_hdr);
+       mutex_enter(hash_lock);
+
        if (zfs_flags & ZFS_DEBUG_MODIFY) {
                if (buf->b_hdr->b_state != arc_anon)
                        panic("modifying non-anon buffer!");
@@ -947,18 +970,32 @@ arc_buf_thaw(arc_buf_t *buf)
                kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
                buf->b_hdr->b_freeze_cksum = NULL;
        }
+
+       if (zfs_flags & ZFS_DEBUG_MODIFY) {
+               if (buf->b_hdr->b_thawed)
+                       kmem_free(buf->b_hdr->b_thawed, 1);
+               buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
+       }
+
        mutex_exit(&buf->b_hdr->b_freeze_lock);
+       mutex_exit(hash_lock);
 }
 
 void
 arc_buf_freeze(arc_buf_t *buf)
 {
+       kmutex_t *hash_lock;
+
        if (!(zfs_flags & ZFS_DEBUG_MODIFY))
                return;
 
+       hash_lock = HDR_LOCK(buf->b_hdr);
+       mutex_enter(hash_lock);
+
        ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
            buf->b_hdr->b_state == arc_anon);
        arc_cksum_compute(buf, B_FALSE);
+       mutex_exit(hash_lock);
 }
 
 static void
@@ -1030,6 +1067,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
        ASSERT(new_state != old_state);
        ASSERT(refcnt == 0 || ab->b_datacnt > 0);
        ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
+       ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
 
        from_delta = to_delta = ab->b_datacnt * ab->b_size;
 
@@ -1050,7 +1088,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
 
                        /*
                         * If prefetching out of the ghost cache,
-                        * we will have a non-null datacnt.
+                        * we will have a non-zero datacnt.
                         */
                        if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
                                /* ghost elements have a ghost size */
@@ -1086,9 +1124,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
        }
 
        ASSERT(!BUF_EMPTY(ab));
-       if (new_state == arc_anon) {
+       if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
                buf_hash_remove(ab);
-       }
 
        /* adjust state sizes */
        if (to_delta)
@@ -1232,14 +1269,29 @@ arc_return_buf(arc_buf_t *buf, void *tag)
 {
        arc_buf_hdr_t *hdr = buf->b_hdr;
 
-       ASSERT(hdr->b_state == arc_anon);
        ASSERT(buf->b_data != NULL);
-       VERIFY(refcount_remove(&hdr->b_refcnt, arc_onloan_tag) == 0);
-       VERIFY(refcount_add(&hdr->b_refcnt, tag) == 1);
+       (void) refcount_add(&hdr->b_refcnt, tag);
+       (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
 
        atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
 }
 
+/* Detach an arc_buf from a dbuf (tag) */
+void
+arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
+{
+       arc_buf_hdr_t *hdr;
+
+       ASSERT(buf->b_data != NULL);
+       hdr = buf->b_hdr;
+       (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
+       (void) refcount_remove(&hdr->b_refcnt, tag);
+       buf->b_efunc = NULL;
+       buf->b_private = NULL;
+
+       atomic_add_64(&arc_loaned_bytes, hdr->b_size);
+}
+
 static arc_buf_t *
 arc_buf_clone(arc_buf_t *from)
 {
@@ -1247,6 +1299,8 @@ arc_buf_clone(arc_buf_t *from)
        arc_buf_hdr_t *hdr = from->b_hdr;
        uint64_t size = hdr->b_size;
 
+       ASSERT(hdr->b_state != arc_anon);
+
        buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
        buf->b_hdr = hdr;
        buf->b_data = NULL;
@@ -1271,16 +1325,16 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag)
         * must verify b_data != NULL to know if the add_ref
         * was successful.
         */
-       rw_enter(&buf->b_lock, RW_READER);
+       mutex_enter(&buf->b_evict_lock);
        if (buf->b_data == NULL) {
-               rw_exit(&buf->b_lock);
+               mutex_exit(&buf->b_evict_lock);
                return;
        }
-       hdr = buf->b_hdr;
-       ASSERT(hdr != NULL);
-       hash_lock = HDR_LOCK(hdr);
+       hash_lock = HDR_LOCK(buf->b_hdr);
        mutex_enter(hash_lock);
-       rw_exit(&buf->b_lock);
+       hdr = buf->b_hdr;
+       ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+       mutex_exit(&buf->b_evict_lock);
 
        ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
        add_reference(hdr, hash_lock, tag);
@@ -1328,6 +1382,7 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
                arc_buf_contents_t type = buf->b_hdr->b_type;
 
                arc_cksum_verify(buf);
+
                if (!recycle) {
                        if (type == ARC_BUFC_METADATA) {
                                arc_buf_data_free(buf->b_hdr, zio_buf_free,
@@ -1365,6 +1420,7 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
        for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
                continue;
        *bufp = buf->b_next;
+       buf->b_next = NULL;
 
        ASSERT(buf->b_efunc == NULL);
 
@@ -1379,55 +1435,55 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
        ASSERT(refcount_is_zero(&hdr->b_refcnt));
        ASSERT3P(hdr->b_state, ==, arc_anon);
        ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-       ASSERT(!(hdr->b_flags & ARC_STORED));
+       l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
 
-       if (hdr->b_l2hdr != NULL) {
-               if (!MUTEX_HELD(&l2arc_buflist_mtx)) {
-                       /*
-                        * To prevent arc_free() and l2arc_evict() from
-                        * attempting to free the same buffer at the same time,
-                        * a FREE_IN_PROGRESS flag is given to arc_free() to
-                        * give it priority.  l2arc_evict() can't destroy this
-                        * header while we are waiting on l2arc_buflist_mtx.
-                        *
-                        * The hdr may be removed from l2ad_buflist before we
-                        * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
-                        */
+       if (l2hdr != NULL) {
+               boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
+               /*
+                * To prevent arc_free() and l2arc_evict() from
+                * attempting to free the same buffer at the same time,
+                * a FREE_IN_PROGRESS flag is given to arc_free() to
+                * give it priority.  l2arc_evict() can't destroy this
+                * header while we are waiting on l2arc_buflist_mtx.
+                *
+                * The hdr may be removed from l2ad_buflist before we
+                * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
+                */
+               if (!buflist_held) {
                        mutex_enter(&l2arc_buflist_mtx);
-                       if (hdr->b_l2hdr != NULL) {
-                               list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist,
-                                   hdr);
-                       }
-                       mutex_exit(&l2arc_buflist_mtx);
-               } else {
-                       list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
+                       l2hdr = hdr->b_l2hdr;
                }
-               ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
-               kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t));
-               if (hdr->b_state == arc_l2c_only)
-                       l2arc_hdr_stat_remove();
-               hdr->b_l2hdr = NULL;
+
+               if (l2hdr != NULL) {
+                       list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
+                       ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+                       kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+                       if (hdr->b_state == arc_l2c_only)
+                               l2arc_hdr_stat_remove();
+                       hdr->b_l2hdr = NULL;
+               }
+
+               if (!buflist_held)
+                       mutex_exit(&l2arc_buflist_mtx);
        }
 
        if (!BUF_EMPTY(hdr)) {
                ASSERT(!HDR_IN_HASH_TABLE(hdr));
-               bzero(&hdr->b_dva, sizeof (dva_t));
-               hdr->b_birth = 0;
-               hdr->b_cksum0 = 0;
+               buf_discard_identity(hdr);
        }
        while (hdr->b_buf) {
                arc_buf_t *buf = hdr->b_buf;
 
                if (buf->b_efunc) {
                        mutex_enter(&arc_eviction_mtx);
-                       rw_enter(&buf->b_lock, RW_WRITER);
+                       mutex_enter(&buf->b_evict_lock);
                        ASSERT(buf->b_hdr != NULL);
                        arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
                        hdr->b_buf = buf->b_next;
                        buf->b_hdr = &arc_eviction_hdr;
                        buf->b_next = arc_eviction_list;
                        arc_eviction_list = buf;
-                       rw_exit(&buf->b_lock);
+                       mutex_exit(&buf->b_evict_lock);
                        mutex_exit(&arc_eviction_mtx);
                } else {
                        arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
@@ -1437,6 +1493,10 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
                kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
                hdr->b_freeze_cksum = NULL;
        }
+       if (hdr->b_thawed) {
+               kmem_free(hdr->b_thawed, 1);
+               hdr->b_thawed = NULL;
+       }
 
        ASSERT(!list_link_active(&hdr->b_arc_node));
        ASSERT3P(hdr->b_hash_next, ==, NULL);
@@ -1457,11 +1517,17 @@ arc_buf_free(arc_buf_t *buf, void *tag)
                kmutex_t *hash_lock = HDR_LOCK(hdr);
 
                mutex_enter(hash_lock);
+               hdr = buf->b_hdr;
+               ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+
                (void) remove_reference(hdr, hash_lock, tag);
-               if (hdr->b_datacnt > 1)
+               if (hdr->b_datacnt > 1) {
                        arc_buf_destroy(buf, FALSE, TRUE);
-               else
+               } else {
+                       ASSERT(buf == hdr->b_buf);
+                       ASSERT(buf->b_efunc == NULL);
                        hdr->b_flags |= ARC_BUF_AVAILABLE;
+               }
                mutex_exit(hash_lock);
        } else if (HDR_IO_IN_PROGRESS(hdr)) {
                int destroy_hdr;
@@ -1478,12 +1544,10 @@ arc_buf_free(arc_buf_t *buf, void *tag)
                if (destroy_hdr)
                        arc_hdr_destroy(hdr);
        } else {
-               if (remove_reference(hdr, NULL, tag) > 0) {
-                       ASSERT(HDR_IO_ERROR(hdr));
+               if (remove_reference(hdr, NULL, tag) > 0)
                        arc_buf_destroy(buf, FALSE, TRUE);
-               } else {
+               else
                        arc_hdr_destroy(hdr);
-               }
        }
 }
 
@@ -1495,11 +1559,14 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag)
        int no_callback = (buf->b_efunc == NULL);
 
        if (hdr->b_state == arc_anon) {
+               ASSERT(hdr->b_datacnt == 1);
                arc_buf_free(buf, tag);
                return (no_callback);
        }
 
        mutex_enter(hash_lock);
+       hdr = buf->b_hdr;
+       ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
        ASSERT(hdr->b_state != arc_anon);
        ASSERT(buf->b_data != NULL);
 
@@ -1509,6 +1576,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag)
                        arc_buf_destroy(buf, FALSE, TRUE);
        } else if (no_callback) {
                ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
+               ASSERT(buf->b_efunc == NULL);
                hdr->b_flags |= ARC_BUF_AVAILABLE;
        }
        ASSERT(no_callback || hdr->b_datacnt > 1 ||
@@ -1561,7 +1629,8 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
                if (HDR_IO_IN_PROGRESS(ab) ||
                    (spa && ab->b_spa != spa) ||
                    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
-                   lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) {
+                   ddi_get_lbolt() - ab->b_arc_access <
+                   arc_min_prefetch_lifespan)) {
                        skipped++;
                        continue;
                }
@@ -1576,7 +1645,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
                        ASSERT(ab->b_datacnt > 0);
                        while (ab->b_buf) {
                                arc_buf_t *buf = ab->b_buf;
-                               if (!rw_tryenter(&buf->b_lock, RW_WRITER)) {
+                               if (!mutex_tryenter(&buf->b_evict_lock)) {
                                        missed += 1;
                                        break;
                                }
@@ -1598,13 +1667,28 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
                                        buf->b_next = arc_eviction_list;
                                        arc_eviction_list = buf;
                                        mutex_exit(&arc_eviction_mtx);
-                                       rw_exit(&buf->b_lock);
+                                       mutex_exit(&buf->b_evict_lock);
                                } else {
-                                       rw_exit(&buf->b_lock);
+                                       mutex_exit(&buf->b_evict_lock);
                                        arc_buf_destroy(buf,
                                            buf->b_data == stolen, TRUE);
                                }
                        }
+
+                       if (ab->b_l2hdr) {
+                               ARCSTAT_INCR(arcstat_evict_l2_cached,
+                                   ab->b_size);
+                       } else {
+                               if (l2arc_write_eligible(ab->b_spa, ab)) {
+                                       ARCSTAT_INCR(arcstat_evict_l2_eligible,
+                                           ab->b_size);
+                               } else {
+                                       ARCSTAT_INCR(
+                                           arcstat_evict_l2_ineligible,
+                                           ab->b_size);
+                               }
+                       }
+
                        if (ab->b_datacnt == 0) {
                                arc_change_state(evicted_state, ab, hash_lock);
                                ASSERT(HDR_IN_HASH_TABLE(ab));
@@ -1679,6 +1763,9 @@ top:
                if (spa && ab->b_spa != spa)
                        continue;
                hash_lock = HDR_LOCK(ab);
+               /* caller may be trying to modify this buffer, skip it */
+               if (MUTEX_HELD(hash_lock))
+                       continue;
                if (mutex_tryenter(hash_lock)) {
                        ASSERT(!HDR_IO_IN_PROGRESS(ab));
                        ASSERT(ab->b_buf == NULL);
@@ -1799,9 +1886,9 @@ arc_do_user_evicts(void)
        while (arc_eviction_list != NULL) {
                arc_buf_t *buf = arc_eviction_list;
                arc_eviction_list = buf->b_next;
-               rw_enter(&buf->b_lock, RW_WRITER);
+               mutex_enter(&buf->b_evict_lock);
                buf->b_hdr = NULL;
-               rw_exit(&buf->b_lock);
+               mutex_exit(&buf->b_evict_lock);
                mutex_exit(&arc_eviction_mtx);
 
                if (buf->b_efunc != NULL)
@@ -2017,12 +2104,12 @@ arc_reclaim_thread(void)
                        }
 
                        /* reset the growth delay for every reclaim */
-                       growtime = lbolt + (arc_grow_retry * hz);
+                       growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
 
                        arc_kmem_reap_now(last_reclaim);
                        arc_warm = B_TRUE;
 
-               } else if (arc_no_grow && lbolt >= growtime) {
+               } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
                        arc_no_grow = FALSE;
                }
 
@@ -2036,7 +2123,7 @@ arc_reclaim_thread(void)
                /* block until needed, or one second, whichever is shorter */
                CALLB_CPR_SAFE_BEGIN(&cpr);
                (void) cv_timedwait(&arc_reclaim_thr_cv,
-                   &arc_reclaim_thr_lock, (lbolt + hz));
+                   &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
                CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
        }
 
@@ -2251,6 +2338,8 @@ out:
 static void
 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
 {
+       clock_t now;
+
        ASSERT(MUTEX_HELD(hash_lock));
 
        if (buf->b_state == arc_anon) {
@@ -2261,11 +2350,13 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                 */
 
                ASSERT(buf->b_arc_access == 0);
-               buf->b_arc_access = lbolt;
+               buf->b_arc_access = ddi_get_lbolt();
                DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
                arc_change_state(arc_mru, buf, hash_lock);
 
        } else if (buf->b_state == arc_mru) {
+               now = ddi_get_lbolt();
+
                /*
                 * If this buffer is here because of a prefetch, then either:
                 * - clear the flag if this is a "referencing" read
@@ -2281,7 +2372,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                                buf->b_flags &= ~ARC_PREFETCH;
                                ARCSTAT_BUMP(arcstat_mru_hits);
                        }
-                       buf->b_arc_access = lbolt;
+                       buf->b_arc_access = now;
                        return;
                }
 
@@ -2290,13 +2381,13 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                 * but it is still in the cache. Move it to the MFU
                 * state.
                 */
-               if (lbolt > buf->b_arc_access + ARC_MINTIME) {
+               if (now > buf->b_arc_access + ARC_MINTIME) {
                        /*
                         * More than 125ms have passed since we
                         * instantiated this buffer.  Move it to the
                         * most frequently used state.
                         */
-                       buf->b_arc_access = lbolt;
+                       buf->b_arc_access = now;
                        DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
                        arc_change_state(arc_mfu, buf, hash_lock);
                }
@@ -2319,7 +2410,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                        DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
                }
 
-               buf->b_arc_access = lbolt;
+               buf->b_arc_access = ddi_get_lbolt();
                arc_change_state(new_state, buf, hash_lock);
 
                ARCSTAT_BUMP(arcstat_mru_ghost_hits);
@@ -2338,7 +2429,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                        ASSERT(list_link_active(&buf->b_arc_node));
                }
                ARCSTAT_BUMP(arcstat_mfu_hits);
-               buf->b_arc_access = lbolt;
+               buf->b_arc_access = ddi_get_lbolt();
        } else if (buf->b_state == arc_mfu_ghost) {
                arc_state_t     *new_state = arc_mfu;
                /*
@@ -2356,7 +2447,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                        new_state = arc_mru;
                }
 
-               buf->b_arc_access = lbolt;
+               buf->b_arc_access = ddi_get_lbolt();
                DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
                arc_change_state(new_state, buf, hash_lock);
 
@@ -2366,7 +2457,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                 * This buffer is on the 2nd Level ARC.
                 */
 
-               buf->b_arc_access = lbolt;
+               buf->b_arc_access = ddi_get_lbolt();
                DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
                arc_change_state(arc_mfu, buf, hash_lock);
        } else {
@@ -2379,7 +2470,8 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
 void
 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
 {
-       bcopy(buf->b_data, arg, buf->b_hdr->b_size);
+       if (zio == NULL || zio->io_error == 0)
+               bcopy(buf->b_data, arg, buf->b_hdr->b_size);
        VERIFY(arc_buf_remove_ref(buf, arg) == 1);
 }
 
@@ -2393,6 +2485,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
                *bufp = NULL;
        } else {
                *bufp = buf;
+               ASSERT(buf->b_data);
        }
 }
 
@@ -2431,7 +2524,7 @@ arc_read_done(zio_t *zio)
        /* byteswap if necessary */
        callback_list = hdr->b_acb;
        ASSERT(callback_list != NULL);
-       if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
+       if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
                arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
                    byteswap_uint64_array :
                    dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
@@ -2440,6 +2533,16 @@ arc_read_done(zio_t *zio)
 
        arc_cksum_compute(buf, B_FALSE);
 
+       if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
+               /*
+                * Only call arc_access on anonymous buffers.  This is because
+                * if we've issued an I/O for an evicted buffer, we've already
+                * called arc_access (to prevent any simultaneous readers from
+                * getting confused).
+                */
+               arc_access(hdr, hash_lock);
+       }
+
        /* create copies of the data buffer for the callers */
        abuf = buf;
        for (acb = callback_list; acb; acb = acb->acb_next) {
@@ -2453,8 +2556,11 @@ arc_read_done(zio_t *zio)
        hdr->b_acb = NULL;
        hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
        ASSERT(!HDR_BUF_AVAILABLE(hdr));
-       if (abuf == buf)
+       if (abuf == buf) {
+               ASSERT(buf->b_efunc == NULL);
+               ASSERT(hdr->b_datacnt == 1);
                hdr->b_flags |= ARC_BUF_AVAILABLE;
+       }
 
        ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
 
@@ -2475,14 +2581,6 @@ arc_read_done(zio_t *zio)
        cv_broadcast(&hdr->b_cv);
 
        if (hash_lock) {
-               /*
-                * Only call arc_access on anonymous buffers.  This is because
-                * if we've issued an I/O for an evicted buffer, we've already
-                * called arc_access (to prevent any simultaneous readers from
-                * getting confused).
-                */
-               if (zio->io_error == 0 && hdr->b_state == arc_anon)
-                       arc_access(hdr, hash_lock);
                mutex_exit(hash_lock);
        } else {
                /*
@@ -2536,25 +2634,34 @@ arc_read_done(zio_t *zio)
  * arc_read_bp.
  */
 int
-arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
     arc_done_func_t *done, void *private, int priority, int zio_flags,
     uint32_t *arc_flags, const zbookmark_t *zb)
 {
        int err;
 
+       if (pbuf == NULL) {
+               /*
+                * XXX This happens from traverse callback funcs, for
+                * the objset_phys_t block.
+                */
+               return (arc_read_nolock(pio, spa, bp, done, private, priority,
+                   zio_flags, arc_flags, zb));
+       }
+
        ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
        ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
-       rw_enter(&pbuf->b_lock, RW_READER);
+       rw_enter(&pbuf->b_data_lock, RW_READER);
 
        err = arc_read_nolock(pio, spa, bp, done, private, priority,
            zio_flags, arc_flags, zb);
-       rw_exit(&pbuf->b_lock);
+       rw_exit(&pbuf->b_data_lock);
 
        return (err);
 }
 
 int
-arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
+arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     arc_done_func_t *done, void *private, int priority, int zio_flags,
     uint32_t *arc_flags, const zbookmark_t *zb)
 {
@@ -2565,7 +2672,8 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
        uint64_t guid = spa_guid(spa);
 
 top:
-       hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+       hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
+           &hash_lock);
        if (hdr && hdr->b_datacnt > 0) {
 
                *arc_flags |= ARC_CACHED;
@@ -2619,6 +2727,7 @@ top:
                        } else {
                                buf = arc_buf_clone(buf);
                        }
+
                } else if (*arc_flags & ARC_PREFETCH &&
                    refcount_count(&hdr->b_refcnt) == 0) {
                        hdr->b_flags |= ARC_PREFETCH;
@@ -2649,15 +2758,13 @@ top:
                        buf = arc_buf_alloc(spa, size, private, type);
                        hdr = buf->b_hdr;
                        hdr->b_dva = *BP_IDENTITY(bp);
-                       hdr->b_birth = bp->blk_birth;
+                       hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
                        hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
                        exists = buf_hash_insert(hdr, &hash_lock);
                        if (exists) {
                                /* somebody beat us to the hash insert */
                                mutex_exit(hash_lock);
-                               bzero(&hdr->b_dva, sizeof (dva_t));
-                               hdr->b_birth = 0;
-                               hdr->b_cksum0 = 0;
+                               buf_discard_identity(hdr);
                                (void) arc_buf_remove_ref(buf, private);
                                goto top; /* restart the IO request */
                        }
@@ -2692,12 +2799,14 @@ top:
                        buf->b_private = NULL;
                        buf->b_next = NULL;
                        hdr->b_buf = buf;
-                       arc_get_data_buf(buf);
                        ASSERT(hdr->b_datacnt == 0);
                        hdr->b_datacnt = 1;
-
+                       arc_get_data_buf(buf);
+                       arc_access(hdr, hash_lock);
                }
 
+               ASSERT(!GHOST_STATE(hdr->b_state));
+
                acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
                acb->acb_done = done;
                acb->acb_private = private;
@@ -2706,17 +2815,6 @@ top:
                hdr->b_acb = acb;
                hdr->b_flags |= ARC_IO_IN_PROGRESS;
 
-               /*
-                * If the buffer has been evicted, migrate it to a present state
-                * before issuing the I/O.  Once we drop the hash-table lock,
-                * the header will be marked as I/O in progress and have an
-                * attached buffer.  At this point, anybody who finds this
-                * buffer ought to notice that it's legit but has a pending I/O.
-                */
-
-               if (GHOST_STATE(hdr->b_state))
-                       arc_access(hdr, hash_lock);
-
                if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
                    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
                        devw = hdr->b_l2hdr->b_dev->l2ad_writing;
@@ -2732,8 +2830,8 @@ top:
                mutex_exit(hash_lock);
 
                ASSERT3U(hdr->b_size, ==, size);
-               DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
-                   zbookmark_t *, zb);
+               DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
+                   uint64_t, size, zbookmark_t *, zb);
                ARCSTAT_BUMP(arcstat_misses);
                ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
                    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
@@ -2819,47 +2917,15 @@ top:
        return (0);
 }
 
-/*
- * arc_read() variant to support pool traversal.  If the block is already
- * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
- * The idea is that we don't want pool traversal filling up memory, but
- * if the ARC already has the data anyway, we shouldn't pay for the I/O.
- */
-int
-arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
-{
-       arc_buf_hdr_t *hdr;
-       kmutex_t *hash_mtx;
-       uint64_t guid = spa_guid(spa);
-       int rc = 0;
-
-       hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
-
-       if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
-               arc_buf_t *buf = hdr->b_buf;
-
-               ASSERT(buf);
-               while (buf->b_data == NULL) {
-                       buf = buf->b_next;
-                       ASSERT(buf);
-               }
-               bcopy(buf->b_data, data, hdr->b_size);
-       } else {
-               rc = ENOENT;
-       }
-
-       if (hash_mtx)
-               mutex_exit(hash_mtx);
-
-       return (rc);
-}
-
 void
 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
 {
        ASSERT(buf->b_hdr != NULL);
        ASSERT(buf->b_hdr->b_state != arc_anon);
        ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
+       ASSERT(buf->b_efunc == NULL);
+       ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
+
        buf->b_efunc = func;
        buf->b_private = private;
 }
@@ -2876,14 +2942,14 @@ arc_buf_evict(arc_buf_t *buf)
        kmutex_t *hash_lock;
        arc_buf_t **bufp;
 
-       rw_enter(&buf->b_lock, RW_WRITER);
+       mutex_enter(&buf->b_evict_lock);
        hdr = buf->b_hdr;
        if (hdr == NULL) {
                /*
                 * We are in arc_do_user_evicts().
                 */
                ASSERT(buf->b_data == NULL);
-               rw_exit(&buf->b_lock);
+               mutex_exit(&buf->b_evict_lock);
                return (0);
        } else if (buf->b_data == NULL) {
                arc_buf_t copy = *buf; /* structure assignment */
@@ -2892,14 +2958,15 @@ arc_buf_evict(arc_buf_t *buf)
                 * but let arc_do_user_evicts() do the reaping.
                 */
                buf->b_efunc = NULL;
-               rw_exit(&buf->b_lock);
+               mutex_exit(&buf->b_evict_lock);
                VERIFY(copy.b_efunc(&copy) == 0);
                return (1);
        }
        hash_lock = HDR_LOCK(hdr);
        mutex_enter(hash_lock);
+       hdr = buf->b_hdr;
+       ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
-       ASSERT(buf->b_hdr == hdr);
        ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
        ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
 
@@ -2918,6 +2985,7 @@ arc_buf_evict(arc_buf_t *buf)
                arc_state_t *old_state = hdr->b_state;
                arc_state_t *evicted_state;
 
+               ASSERT(hdr->b_buf == NULL);
                ASSERT(refcount_is_zero(&hdr->b_refcnt));
 
                evicted_state =
@@ -2935,12 +3003,13 @@ arc_buf_evict(arc_buf_t *buf)
                mutex_exit(&old_state->arcs_mtx);
        }
        mutex_exit(hash_lock);
-       rw_exit(&buf->b_lock);
+       mutex_exit(&buf->b_evict_lock);
 
        VERIFY(buf->b_efunc(buf) == 0);
        buf->b_efunc = NULL;
        buf->b_private = NULL;
        buf->b_hdr = NULL;
+       buf->b_next = NULL;
        kmem_cache_free(buf_cache, buf);
        return (1);
 }
@@ -2955,29 +3024,30 @@ void
 arc_release(arc_buf_t *buf, void *tag)
 {
        arc_buf_hdr_t *hdr;
-       kmutex_t *hash_lock;
+       kmutex_t *hash_lock = NULL;
        l2arc_buf_hdr_t *l2hdr;
        uint64_t buf_size;
-       boolean_t released = B_FALSE;
 
-       rw_enter(&buf->b_lock, RW_WRITER);
+       /*
+        * It would be nice to assert that if it's DMU metadata (level >
+        * 0 || it's the dnode file), then it must be syncing context.
+        * But we don't know that information at this level.
+        */
+
+       mutex_enter(&buf->b_evict_lock);
        hdr = buf->b_hdr;
 
        /* this buffer is not on any list */
        ASSERT(refcount_count(&hdr->b_refcnt) > 0);
-       ASSERT(!(hdr->b_flags & ARC_STORED));
 
        if (hdr->b_state == arc_anon) {
                /* this buffer is already released */
-               ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
-               ASSERT(BUF_EMPTY(hdr));
                ASSERT(buf->b_efunc == NULL);
-               arc_buf_thaw(buf);
-               rw_exit(&buf->b_lock);
-               released = B_TRUE;
        } else {
                hash_lock = HDR_LOCK(hdr);
                mutex_enter(hash_lock);
+               hdr = buf->b_hdr;
+               ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
        }
 
        l2hdr = hdr->b_l2hdr;
@@ -2987,9 +3057,6 @@ arc_release(arc_buf_t *buf, void *tag)
                buf_size = hdr->b_size;
        }
 
-       if (released)
-               goto out;
-
        /*
         * Do we have more than one buf?
         */
@@ -3003,14 +3070,14 @@ arc_release(arc_buf_t *buf, void *tag)
 
                ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
                /*
-                * Pull the data off of this buf and attach it to
-                * a new anonymous buf.
+                * Pull the data off of this hdr and attach it to
+                * a new anonymous hdr.
                 */
                (void) remove_reference(hdr, hash_lock, tag);
                bufp = &hdr->b_buf;
                while (*bufp != buf)
                        bufp = &(*bufp)->b_next;
-               *bufp = (*bufp)->b_next;
+               *bufp = buf->b_next;
                buf->b_next = NULL;
 
                ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
@@ -3038,26 +3105,25 @@ arc_release(arc_buf_t *buf, void *tag)
                nhdr->b_freeze_cksum = NULL;
                (void) refcount_add(&nhdr->b_refcnt, tag);
                buf->b_hdr = nhdr;
-               rw_exit(&buf->b_lock);
+               mutex_exit(&buf->b_evict_lock);
                atomic_add_64(&arc_anon->arcs_size, blksz);
        } else {
-               rw_exit(&buf->b_lock);
+               mutex_exit(&buf->b_evict_lock);
                ASSERT(refcount_count(&hdr->b_refcnt) == 1);
                ASSERT(!list_link_active(&hdr->b_arc_node));
                ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-               arc_change_state(arc_anon, hdr, hash_lock);
+               if (hdr->b_state != arc_anon)
+                       arc_change_state(arc_anon, hdr, hash_lock);
                hdr->b_arc_access = 0;
-               mutex_exit(hash_lock);
+               if (hash_lock)
+                       mutex_exit(hash_lock);
 
-               bzero(&hdr->b_dva, sizeof (dva_t));
-               hdr->b_birth = 0;
-               hdr->b_cksum0 = 0;
+               buf_discard_identity(hdr);
                arc_buf_thaw(buf);
        }
        buf->b_efunc = NULL;
        buf->b_private = NULL;
 
-out:
        if (l2hdr) {
                list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
                kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
@@ -3066,14 +3132,27 @@ out:
        }
 }
 
+/*
+ * Release this buffer.  If it does not match the provided BP, fill it
+ * with that block's contents.
+ */
+/* ARGSUSED */
+int
+arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
+    zbookmark_t *zb)
+{
+       arc_release(buf, tag);
+       return (0);
+}
+
 int
 arc_released(arc_buf_t *buf)
 {
        int released;
 
-       rw_enter(&buf->b_lock, RW_READER);
+       mutex_enter(&buf->b_evict_lock);
        released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
-       rw_exit(&buf->b_lock);
+       mutex_exit(&buf->b_evict_lock);
        return (released);
 }
 
@@ -3082,9 +3161,9 @@ arc_has_callback(arc_buf_t *buf)
 {
        int callback;
 
-       rw_enter(&buf->b_lock, RW_READER);
+       mutex_enter(&buf->b_evict_lock);
        callback = (buf->b_efunc != NULL);
-       rw_exit(&buf->b_lock);
+       mutex_exit(&buf->b_evict_lock);
        return (callback);
 }
 
@@ -3094,9 +3173,9 @@ arc_referenced(arc_buf_t *buf)
 {
        int referenced;
 
-       rw_enter(&buf->b_lock, RW_READER);
+       mutex_enter(&buf->b_evict_lock);
        referenced = (refcount_count(&buf->b_hdr->b_refcnt));
-       rw_exit(&buf->b_lock);
+       mutex_exit(&buf->b_evict_lock);
        return (referenced);
 }
 #endif
@@ -3136,21 +3215,28 @@ arc_write_done(zio_t *zio)
        arc_buf_t *buf = callback->awcb_buf;
        arc_buf_hdr_t *hdr = buf->b_hdr;
 
-       hdr->b_acb = NULL;
+       ASSERT(hdr->b_acb == NULL);
+
+       if (zio->io_error == 0) {
+               hdr->b_dva = *BP_IDENTITY(zio->io_bp);
+               hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
+               hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
+       } else {
+               ASSERT(BUF_EMPTY(hdr));
+       }
 
-       hdr->b_dva = *BP_IDENTITY(zio->io_bp);
-       hdr->b_birth = zio->io_bp->blk_birth;
-       hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
        /*
         * If the block to be written was all-zero, we may have
         * compressed it away.  In this case no write was performed
-        * so there will be no dva/birth-date/checksum.  The buffer
-        * must therefor remain anonymous (and uncached).
+        * so there will be no dva/birth/checksum.  The buffer must
+        * therefore remain anonymous (and uncached).
         */
        if (!BUF_EMPTY(hdr)) {
                arc_buf_hdr_t *exists;
                kmutex_t *hash_lock;
 
+               ASSERT(zio->io_error == 0);
+
                arc_cksum_verify(buf);
 
                exists = buf_hash_insert(hdr, &hash_lock);
@@ -3160,106 +3246,54 @@ arc_write_done(zio_t *zio)
                         * sync-to-convergence, because we remove
                         * buffers from the hash table when we arc_free().
                         */
-                       ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE);
-                       ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
-                           BP_IDENTITY(zio->io_bp)));
-                       ASSERT3U(zio->io_bp_orig.blk_birth, ==,
-                           zio->io_bp->blk_birth);
-
-                       ASSERT(refcount_is_zero(&exists->b_refcnt));
-                       arc_change_state(arc_anon, exists, hash_lock);
-                       mutex_exit(hash_lock);
-                       arc_hdr_destroy(exists);
-                       exists = buf_hash_insert(hdr, &hash_lock);
-                       ASSERT3P(exists, ==, NULL);
+                       if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+                               if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
+                                       panic("bad overwrite, hdr=%p exists=%p",
+                                           (void *)hdr, (void *)exists);
+                               ASSERT(refcount_is_zero(&exists->b_refcnt));
+                               arc_change_state(arc_anon, exists, hash_lock);
+                               mutex_exit(hash_lock);
+                               arc_hdr_destroy(exists);
+                               exists = buf_hash_insert(hdr, &hash_lock);
+                               ASSERT3P(exists, ==, NULL);
+                       } else {
+                               /* Dedup */
+                               ASSERT(hdr->b_datacnt == 1);
+                               ASSERT(hdr->b_state == arc_anon);
+                               ASSERT(BP_GET_DEDUP(zio->io_bp));
+                               ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
+                       }
                }
                hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
                /* if it's not anon, we are doing a scrub */
-               if (hdr->b_state == arc_anon)
+               if (!exists && hdr->b_state == arc_anon)
                        arc_access(hdr, hash_lock);
                mutex_exit(hash_lock);
-       } else if (callback->awcb_done == NULL) {
-               int destroy_hdr;
-               /*
-                * This is an anonymous buffer with no user callback,
-                * destroy it if there are no active references.
-                */
-               mutex_enter(&arc_eviction_mtx);
-               destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
-               hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
-               mutex_exit(&arc_eviction_mtx);
-               if (destroy_hdr)
-                       arc_hdr_destroy(hdr);
        } else {
                hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
        }
-       hdr->b_flags &= ~ARC_STORED;
 
-       if (callback->awcb_done) {
-               ASSERT(!refcount_is_zero(&hdr->b_refcnt));
-               callback->awcb_done(zio, buf, callback->awcb_private);
-       }
+       ASSERT(!refcount_is_zero(&hdr->b_refcnt));
+       callback->awcb_done(zio, buf, callback->awcb_private);
 
        kmem_free(callback, sizeof (arc_write_callback_t));
 }
 
-void
-write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp)
-{
-       boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata);
-
-       /* Determine checksum setting */
-       if (ismd) {
-               /*
-                * Metadata always gets checksummed.  If the data
-                * checksum is multi-bit correctable, and it's not a
-                * ZBT-style checksum, then it's suitable for metadata
-                * as well.  Otherwise, the metadata checksum defaults
-                * to fletcher4.
-                */
-               if (zio_checksum_table[wp->wp_oschecksum].ci_correctable &&
-                   !zio_checksum_table[wp->wp_oschecksum].ci_zbt)
-                       zp->zp_checksum = wp->wp_oschecksum;
-               else
-                       zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4;
-       } else {
-               zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum,
-                   wp->wp_oschecksum);
-       }
-
-       /* Determine compression setting */
-       if (ismd) {
-               /*
-                * XXX -- we should design a compression algorithm
-                * that specializes in arrays of bps.
-                */
-               zp->zp_compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
-                   ZIO_COMPRESS_LZJB;
-       } else {
-               zp->zp_compress = zio_compress_select(wp->wp_dncompress,
-                   wp->wp_oscompress);
-       }
-
-       zp->zp_type = wp->wp_type;
-       zp->zp_level = wp->wp_level;
-       zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa));
-}
-
 zio_t *
-arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
-    boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
-    arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
-    int zio_flags, const zbookmark_t *zb)
+arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
+    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
+    arc_done_func_t *ready, arc_done_func_t *done, void *private,
+    int priority, int zio_flags, const zbookmark_t *zb)
 {
        arc_buf_hdr_t *hdr = buf->b_hdr;
        arc_write_callback_t *callback;
        zio_t *zio;
-       zio_prop_t zp;
 
        ASSERT(ready != NULL);
+       ASSERT(done != NULL);
        ASSERT(!HDR_IO_ERROR(hdr));
        ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
-       ASSERT(hdr->b_acb == 0);
+       ASSERT(hdr->b_acb == NULL);
        if (l2arc)
                hdr->b_flags |= ARC_L2CACHE;
        callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
@@ -3268,88 +3302,12 @@ arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
        callback->awcb_private = private;
        callback->awcb_buf = buf;
 
-       write_policy(spa, wp, &zp);
-       zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp,
+       zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
            arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
 
        return (zio);
 }
 
-int
-arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    zio_done_func_t *done, void *private, uint32_t arc_flags)
-{
-       arc_buf_hdr_t *ab;
-       kmutex_t *hash_lock;
-       zio_t   *zio;
-       uint64_t guid = spa_guid(spa);
-
-       /*
-        * If this buffer is in the cache, release it, so it
-        * can be re-used.
-        */
-       ab = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
-       if (ab != NULL) {
-               /*
-                * The checksum of blocks to free is not always
-                * preserved (eg. on the deadlist).  However, if it is
-                * nonzero, it should match what we have in the cache.
-                */
-               ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
-                   bp->blk_cksum.zc_word[0] == ab->b_cksum0 ||
-                   bp->blk_fill == BLK_FILL_ALREADY_FREED);
-
-               if (ab->b_state != arc_anon)
-                       arc_change_state(arc_anon, ab, hash_lock);
-               if (HDR_IO_IN_PROGRESS(ab)) {
-                       /*
-                        * This should only happen when we prefetch.
-                        */
-                       ASSERT(ab->b_flags & ARC_PREFETCH);
-                       ASSERT3U(ab->b_datacnt, ==, 1);
-                       ab->b_flags |= ARC_FREED_IN_READ;
-                       if (HDR_IN_HASH_TABLE(ab))
-                               buf_hash_remove(ab);
-                       ab->b_arc_access = 0;
-                       bzero(&ab->b_dva, sizeof (dva_t));
-                       ab->b_birth = 0;
-                       ab->b_cksum0 = 0;
-                       ab->b_buf->b_efunc = NULL;
-                       ab->b_buf->b_private = NULL;
-                       mutex_exit(hash_lock);
-               } else if (refcount_is_zero(&ab->b_refcnt)) {
-                       ab->b_flags |= ARC_FREE_IN_PROGRESS;
-                       mutex_exit(hash_lock);
-                       arc_hdr_destroy(ab);
-                       ARCSTAT_BUMP(arcstat_deleted);
-               } else {
-                       /*
-                        * We still have an active reference on this
-                        * buffer.  This can happen, e.g., from
-                        * dbuf_unoverride().
-                        */
-                       ASSERT(!HDR_IN_HASH_TABLE(ab));
-                       ab->b_arc_access = 0;
-                       bzero(&ab->b_dva, sizeof (dva_t));
-                       ab->b_birth = 0;
-                       ab->b_cksum0 = 0;
-                       ab->b_buf->b_efunc = NULL;
-                       ab->b_buf->b_private = NULL;
-                       mutex_exit(hash_lock);
-               }
-       }
-
-       zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED);
-
-       if (arc_flags & ARC_WAIT)
-               return (zio_wait(zio));
-
-       ASSERT(arc_flags & ARC_NOWAIT);
-       zio_nowait(zio);
-
-       return (0);
-}
-
 static int
 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
 {
@@ -3786,12 +3744,11 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
        /*
         * A buffer is *not* eligible for the L2ARC if it:
         * 1. belongs to a different spa.
-        * 2. has no attached buffer.
-        * 3. is already cached on the L2ARC.
-        * 4. has an I/O in progress (it may be an incomplete read).
-        * 5. is flagged not eligible (zfs property).
+        * 2. is already cached on the L2ARC.
+        * 3. has an I/O in progress (it may be an incomplete read).
+        * 4. is flagged not eligible (zfs property).
         */
-       if (ab->b_spa != spa_guid || ab->b_buf == NULL || ab->b_l2hdr != NULL ||
+       if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
            HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
                return (B_FALSE);
 
@@ -3815,7 +3772,7 @@ l2arc_write_size(l2arc_dev_t *dev)
 static clock_t
 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
 {
-       clock_t interval, next;
+       clock_t interval, next, now;
 
        /*
         * If the ARC lists are busy, increase our write rate; if the
@@ -3828,7 +3785,8 @@ l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
        else
                interval = hz * l2arc_feed_secs;
 
-       next = MAX(lbolt, MIN(lbolt + interval, began + interval));
+       now = ddi_get_lbolt();
+       next = MAX(now, MIN(now + interval, began + interval));
 
        return (next);
 }
@@ -4030,11 +3988,11 @@ l2arc_read_done(zio_t *zio)
        ASSERT(cb != NULL);
        buf = cb->l2rcb_buf;
        ASSERT(buf != NULL);
-       hdr = buf->b_hdr;
-       ASSERT(hdr != NULL);
 
-       hash_lock = HDR_LOCK(hdr);
+       hash_lock = HDR_LOCK(buf->b_hdr);
        mutex_enter(hash_lock);
+       hdr = buf->b_hdr;
+       ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
        /*
         * Check this survived the L2ARC journey.
@@ -4247,7 +4205,7 @@ top:
        }
        mutex_exit(&l2arc_buflist_mtx);
 
-       spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict));
+       vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
        dev->l2ad_evict = taddr;
 }
 
@@ -4407,15 +4365,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
        ARCSTAT_BUMP(arcstat_l2_writes_sent);
        ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
        ARCSTAT_INCR(arcstat_l2_size, write_sz);
-       spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz);
+       vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
 
        /*
         * Bump device hand to the device start if it is approaching the end.
         * l2arc_evict() will already have evicted ahead for this case.
         */
        if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
-               spa_l2cache_space_update(dev->l2ad_vdev, 0,
-                   dev->l2ad_end - dev->l2ad_hand);
+               vdev_space_update(dev->l2ad_vdev,
+                   dev->l2ad_end - dev->l2ad_hand, 0, 0);
                dev->l2ad_hand = dev->l2ad_start;
                dev->l2ad_evict = dev->l2ad_start;
                dev->l2ad_first = B_FALSE;
@@ -4439,7 +4397,7 @@ l2arc_feed_thread(void)
        l2arc_dev_t *dev;
        spa_t *spa;
        uint64_t size, wrote;
-       clock_t begin, next = lbolt;
+       clock_t begin, next = ddi_get_lbolt();
 
        CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
 
@@ -4450,7 +4408,7 @@ l2arc_feed_thread(void)
                (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
                    next);
                CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
-               next = lbolt + hz;
+               next = ddi_get_lbolt() + hz;
 
                /*
                 * Quick check for L2ARC devices.
@@ -4461,7 +4419,7 @@ l2arc_feed_thread(void)
                        continue;
                }
                mutex_exit(&l2arc_dev_mtx);
-               begin = lbolt;
+               begin = ddi_get_lbolt();
 
                /*
                 * This selects the next l2arc device to write to, and in
@@ -4566,7 +4524,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
        list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
            offsetof(arc_buf_hdr_t, b_l2node));
 
-       spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0);
+       vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
 
        /*
         * Add device to global list
index 93b7741..066ccc6 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/bplist.h>
 #include <sys/zfs_context.h>
 
-static int
-bplist_hold(bplist_t *bpl)
-{
-       ASSERT(MUTEX_HELD(&bpl->bpl_lock));
-       if (bpl->bpl_dbuf == NULL) {
-               int err = dmu_bonus_hold(bpl->bpl_mos,
-                   bpl->bpl_object, bpl, &bpl->bpl_dbuf);
-               if (err)
-                       return (err);
-               bpl->bpl_phys = bpl->bpl_dbuf->db_data;
-       }
-       return (0);
-}
-
-uint64_t
-bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx)
-{
-       int size;
-
-       size = spa_version(dmu_objset_spa(mos)) < SPA_VERSION_BPLIST_ACCOUNT ?
-           BPLIST_SIZE_V0 : sizeof (bplist_phys_t);
-
-       return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize,
-           DMU_OT_BPLIST_HDR, size, tx));
-}
 
 void
-bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx)
+bplist_create(bplist_t *bpl)
 {
-       VERIFY(dmu_object_free(mos, object, tx) == 0);
-}
-
-int
-bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
-{
-       dmu_object_info_t doi;
-       int err;
-
-       err = dmu_object_info(mos, object, &doi);
-       if (err)
-               return (err);
-
-       mutex_enter(&bpl->bpl_lock);
-
-       ASSERT(bpl->bpl_dbuf == NULL);
-       ASSERT(bpl->bpl_phys == NULL);
-       ASSERT(bpl->bpl_cached_dbuf == NULL);
-       ASSERT(bpl->bpl_queue == NULL);
-       ASSERT(object != 0);
-       ASSERT3U(doi.doi_type, ==, DMU_OT_BPLIST);
-       ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPLIST_HDR);
-
-       bpl->bpl_mos = mos;
-       bpl->bpl_object = object;
-       bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1);
-       bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT;
-       bpl->bpl_havecomp = (doi.doi_bonus_size == sizeof (bplist_phys_t));
-
-       mutex_exit(&bpl->bpl_lock);
-       return (0);
+       mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL);
+       list_create(&bpl->bpl_list, sizeof (bplist_entry_t),
+           offsetof(bplist_entry_t, bpe_node));
 }
 
 void
-bplist_close(bplist_t *bpl)
-{
-       mutex_enter(&bpl->bpl_lock);
-
-       ASSERT(bpl->bpl_queue == NULL);
-
-       if (bpl->bpl_cached_dbuf) {
-               dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
-               bpl->bpl_cached_dbuf = NULL;
-       }
-       if (bpl->bpl_dbuf) {
-               dmu_buf_rele(bpl->bpl_dbuf, bpl);
-               bpl->bpl_dbuf = NULL;
-               bpl->bpl_phys = NULL;
-       }
-
-       mutex_exit(&bpl->bpl_lock);
-}
-
-boolean_t
-bplist_empty(bplist_t *bpl)
-{
-       boolean_t rv;
-
-       if (bpl->bpl_object == 0)
-               return (B_TRUE);
-
-       mutex_enter(&bpl->bpl_lock);
-       VERIFY(0 == bplist_hold(bpl)); /* XXX */
-       rv = (bpl->bpl_phys->bpl_entries == 0);
-       mutex_exit(&bpl->bpl_lock);
-
-       return (rv);
-}
-
-static int
-bplist_cache(bplist_t *bpl, uint64_t blkid)
-{
-       int err = 0;
-
-       if (bpl->bpl_cached_dbuf == NULL ||
-           bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) {
-               if (bpl->bpl_cached_dbuf != NULL)
-                       dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
-               err = dmu_buf_hold(bpl->bpl_mos,
-                   bpl->bpl_object, blkid << bpl->bpl_blockshift,
-                   bpl, &bpl->bpl_cached_dbuf);
-               ASSERT(err || bpl->bpl_cached_dbuf->db_size ==
-                   1ULL << bpl->bpl_blockshift);
-       }
-       return (err);
-}
-
-int
-bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
-{
-       uint64_t blk, off;
-       blkptr_t *bparray;
-       int err;
-
-       mutex_enter(&bpl->bpl_lock);
-
-       err = bplist_hold(bpl);
-       if (err) {
-               mutex_exit(&bpl->bpl_lock);
-               return (err);
-       }
-
-       if (*itorp >= bpl->bpl_phys->bpl_entries) {
-               mutex_exit(&bpl->bpl_lock);
-               return (ENOENT);
-       }
-
-       blk = *itorp >> bpl->bpl_bpshift;
-       off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
-
-       err = bplist_cache(bpl, blk);
-       if (err) {
-               mutex_exit(&bpl->bpl_lock);
-               return (err);
-       }
-
-       bparray = bpl->bpl_cached_dbuf->db_data;
-       *bp = bparray[off];
-       (*itorp)++;
-       mutex_exit(&bpl->bpl_lock);
-       return (0);
-}
-
-int
-bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx)
+bplist_destroy(bplist_t *bpl)
 {
-       uint64_t blk, off;
-       blkptr_t *bparray;
-       int err;
-
-       ASSERT(!BP_IS_HOLE(bp));
-       mutex_enter(&bpl->bpl_lock);
-       err = bplist_hold(bpl);
-       if (err)
-               return (err);
-
-       blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
-       off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
-
-       err = bplist_cache(bpl, blk);
-       if (err) {
-               mutex_exit(&bpl->bpl_lock);
-               return (err);
-       }
-
-       dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx);
-       bparray = bpl->bpl_cached_dbuf->db_data;
-       bparray[off] = *bp;
-
-       /* We never need the fill count. */
-       bparray[off].blk_fill = 0;
-
-       /* The bplist will compress better if we can leave off the checksum */
-       bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));
-
-       dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
-       bpl->bpl_phys->bpl_entries++;
-       bpl->bpl_phys->bpl_bytes +=
-           bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), bp);
-       if (bpl->bpl_havecomp) {
-               bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp);
-               bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp);
-       }
-       mutex_exit(&bpl->bpl_lock);
-
-       return (0);
+       list_destroy(&bpl->bpl_list);
+       mutex_destroy(&bpl->bpl_lock);
 }
 
-/*
- * Deferred entry; will be written later by bplist_sync().
- */
 void
-bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp)
+bplist_append(bplist_t *bpl, const blkptr_t *bp)
 {
-       bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP);
+       bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_SLEEP);
 
-       ASSERT(!BP_IS_HOLE(bp));
        mutex_enter(&bpl->bpl_lock);
-       bpq->bpq_blk = *bp;
-       bpq->bpq_next = bpl->bpl_queue;
-       bpl->bpl_queue = bpq;
+       bpe->bpe_blk = *bp;
+       list_insert_tail(&bpl->bpl_list, bpe);
        mutex_exit(&bpl->bpl_lock);
 }
 
 void
-bplist_sync(bplist_t *bpl, dmu_tx_t *tx)
+bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx)
 {
-       bplist_q_t *bpq;
+       bplist_entry_t *bpe;
 
        mutex_enter(&bpl->bpl_lock);
-       while ((bpq = bpl->bpl_queue) != NULL) {
-               bpl->bpl_queue = bpq->bpq_next;
+       while ((bpe = list_head(&bpl->bpl_list)) != NULL) {
+               list_remove(&bpl->bpl_list, bpe);
                mutex_exit(&bpl->bpl_lock);
-               VERIFY(0 == bplist_enqueue(bpl, &bpq->bpq_blk, tx));
-               kmem_free(bpq, sizeof (*bpq));
+               func(arg, &bpe->bpe_blk, tx);
+               kmem_free(bpe, sizeof (*bpe));
                mutex_enter(&bpl->bpl_lock);
        }
        mutex_exit(&bpl->bpl_lock);
 }
-
-void
-bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
-{
-       mutex_enter(&bpl->bpl_lock);
-       ASSERT3P(bpl->bpl_queue, ==, NULL);
-       VERIFY(0 == bplist_hold(bpl));
-       dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
-       VERIFY(0 == dmu_free_range(bpl->bpl_mos,
-           bpl->bpl_object, 0, -1ULL, tx));
-       bpl->bpl_phys->bpl_entries = 0;
-       bpl->bpl_phys->bpl_bytes = 0;
-       if (bpl->bpl_havecomp) {
-               bpl->bpl_phys->bpl_comp = 0;
-               bpl->bpl_phys->bpl_uncomp = 0;
-       }
-       mutex_exit(&bpl->bpl_lock);
-}
-
-int
-bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
-{
-       int err;
-
-       mutex_enter(&bpl->bpl_lock);
-
-       err = bplist_hold(bpl);
-       if (err) {
-               mutex_exit(&bpl->bpl_lock);
-               return (err);
-       }
-
-       *usedp = bpl->bpl_phys->bpl_bytes;
-       if (bpl->bpl_havecomp) {
-               *compp = bpl->bpl_phys->bpl_comp;
-               *uncompp = bpl->bpl_phys->bpl_uncomp;
-       }
-       mutex_exit(&bpl->bpl_lock);
-
-       if (!bpl->bpl_havecomp) {
-               uint64_t itor = 0, comp = 0, uncomp = 0;
-               blkptr_t bp;
-
-               while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
-                       comp += BP_GET_PSIZE(&bp);
-                       uncomp += BP_GET_UCSIZE(&bp);
-               }
-               if (err == ENOENT)
-                       err = 0;
-               *compp = comp;
-               *uncompp = uncomp;
-       }
-
-       return (err);
-}
-
-/*
- * Return (in *dasizep) the amount of space on the deadlist which is:
- * mintxg < blk_birth <= maxtxg
- */
-int
-bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg,
-    uint64_t *dasizep)
-{
-       uint64_t size = 0;
-       uint64_t itor = 0;
-       blkptr_t bp;
-       int err;
-
-       /*
-        * As an optimization, if they want the whole txg range, just
-        * get bpl_bytes rather than iterating over the bps.
-        */
-       if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX) {
-               mutex_enter(&bpl->bpl_lock);
-               err = bplist_hold(bpl);
-               if (err == 0)
-                       *dasizep = bpl->bpl_phys->bpl_bytes;
-               mutex_exit(&bpl->bpl_lock);
-               return (err);
-       }
-
-       while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
-               if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) {
-                       size +=
-                           bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), &bp);
-               }
-       }
-       if (err == ENOENT)
-               err = 0;
-       *dasizep = size;
-       return (err);
-}
diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c
new file mode 100644 (file)
index 0000000..f81c48a
--- /dev/null
@@ -0,0 +1,462 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/bpobj.h>
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+uint64_t
+bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
+{
+       int size;
+
+       if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
+               size = BPOBJ_SIZE_V0;
+       else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
+               size = BPOBJ_SIZE_V1;
+       else
+               size = sizeof (bpobj_phys_t);
+
+       return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
+           DMU_OT_BPOBJ_HDR, size, tx));
+}
+
+void
+bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+       int64_t i;
+       bpobj_t bpo;
+       dmu_object_info_t doi;
+       int epb;
+       dmu_buf_t *dbuf = NULL;
+
+       VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
+
+       mutex_enter(&bpo.bpo_lock);
+
+       if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
+               goto out;
+
+       VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
+       epb = doi.doi_data_block_size / sizeof (uint64_t);
+
+       for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
+               uint64_t *objarray;
+               uint64_t offset, blkoff;
+
+               offset = i * sizeof (uint64_t);
+               blkoff = P2PHASE(i, epb);
+
+               if (dbuf == NULL || dbuf->db_offset > offset) {
+                       if (dbuf)
+                               dmu_buf_rele(dbuf, FTAG);
+                       VERIFY3U(0, ==, dmu_buf_hold(os,
+                           bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
+               }
+
+               ASSERT3U(offset, >=, dbuf->db_offset);
+               ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+               objarray = dbuf->db_data;
+               bpobj_free(os, objarray[blkoff], tx);
+       }
+       if (dbuf) {
+               dmu_buf_rele(dbuf, FTAG);
+               dbuf = NULL;
+       }
+       VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));
+
+out:
+       mutex_exit(&bpo.bpo_lock);
+       bpobj_close(&bpo);
+
+       VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
+}
+
+int
+bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
+{
+       dmu_object_info_t doi;
+       int err;
+
+       err = dmu_object_info(os, object, &doi);
+       if (err)
+               return (err);
+
+       bzero(bpo, sizeof (*bpo));
+       mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);
+
+       ASSERT(bpo->bpo_dbuf == NULL);
+       ASSERT(bpo->bpo_phys == NULL);
+       ASSERT(object != 0);
+       ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
+       ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
+
+       bpo->bpo_os = os;
+       bpo->bpo_object = object;
+       bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
+       bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
+       bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
+
+       err = dmu_bonus_hold(bpo->bpo_os,
+           bpo->bpo_object, bpo, &bpo->bpo_dbuf);
+       if (err)
+               return (err);
+       bpo->bpo_phys = bpo->bpo_dbuf->db_data;
+       return (0);
+}
+
+void
+bpobj_close(bpobj_t *bpo)
+{
+       /* Lame workaround for closing a bpobj that was never opened. */
+       if (bpo->bpo_object == 0)
+               return;
+
+       dmu_buf_rele(bpo->bpo_dbuf, bpo);
+       if (bpo->bpo_cached_dbuf != NULL)
+               dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
+       bpo->bpo_dbuf = NULL;
+       bpo->bpo_phys = NULL;
+       bpo->bpo_cached_dbuf = NULL;
+
+       mutex_destroy(&bpo->bpo_lock);
+}
+
+static int
+bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
+    boolean_t free)
+{
+       dmu_object_info_t doi;
+       int epb;
+       int64_t i;
+       int err = 0;
+       dmu_buf_t *dbuf = NULL;
+
+       mutex_enter(&bpo->bpo_lock);
+
+       if (free)
+               dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+
+       for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
+               blkptr_t *bparray;
+               blkptr_t *bp;
+               uint64_t offset, blkoff;
+
+               offset = i * sizeof (blkptr_t);
+               blkoff = P2PHASE(i, bpo->bpo_epb);
+
+               if (dbuf == NULL || dbuf->db_offset > offset) {
+                       if (dbuf)
+                               dmu_buf_rele(dbuf, FTAG);
+                       err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
+                           FTAG, &dbuf, 0);
+                       if (err)
+                               break;
+               }
+
+               ASSERT3U(offset, >=, dbuf->db_offset);
+               ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+               bparray = dbuf->db_data;
+               bp = &bparray[blkoff];
+               err = func(arg, bp, tx);
+               if (err)
+                       break;
+               if (free) {
+                       bpo->bpo_phys->bpo_bytes -=
+                           bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
+                       ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
+                       if (bpo->bpo_havecomp) {
+                               bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
+                               bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
+                       }
+                       bpo->bpo_phys->bpo_num_blkptrs--;
+                       ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
+               }
+       }
+       if (dbuf) {
+               dmu_buf_rele(dbuf, FTAG);
+               dbuf = NULL;
+       }
+       if (free) {
+               i++;
+               VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
+                   i * sizeof (blkptr_t), -1ULL, tx));
+       }
+       if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
+               goto out;
+
+       ASSERT(bpo->bpo_havecomp);
+       err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
+       if (err)
+               goto out;
+       epb = doi.doi_data_block_size / sizeof (uint64_t);
+
+       for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
+               uint64_t *objarray;
+               uint64_t offset, blkoff;
+               bpobj_t sublist;
+               uint64_t used_before, comp_before, uncomp_before;
+               uint64_t used_after, comp_after, uncomp_after;
+
+               offset = i * sizeof (uint64_t);
+               blkoff = P2PHASE(i, epb);
+
+               if (dbuf == NULL || dbuf->db_offset > offset) {
+                       if (dbuf)
+                               dmu_buf_rele(dbuf, FTAG);
+                       err = dmu_buf_hold(bpo->bpo_os,
+                           bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
+                       if (err)
+                               break;
+               }
+
+               ASSERT3U(offset, >=, dbuf->db_offset);
+               ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+               objarray = dbuf->db_data;
+               err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
+               if (err)
+                       break;
+               if (free) {
+                       err = bpobj_space(&sublist,
+                           &used_before, &comp_before, &uncomp_before);
+                       if (err)
+                               break;
+               }
+               err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
+               if (free) {
+                       VERIFY3U(0, ==, bpobj_space(&sublist,
+                           &used_after, &comp_after, &uncomp_after));
+                       bpo->bpo_phys->bpo_bytes -= used_before - used_after;
+                       ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
+                       bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
+                       bpo->bpo_phys->bpo_uncomp -=
+                           uncomp_before - uncomp_after;
+               }
+
+               bpobj_close(&sublist);
+               if (err)
+                       break;
+               if (free) {
+                       err = dmu_object_free(bpo->bpo_os,
+                           objarray[blkoff], tx);
+                       if (err)
+                               break;
+                       bpo->bpo_phys->bpo_num_subobjs--;
+                       ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
+               }
+       }
+       if (dbuf) {
+               dmu_buf_rele(dbuf, FTAG);
+               dbuf = NULL;
+       }
+       if (free) {
+               VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
+                   bpo->bpo_phys->bpo_subobjs,
+                   (i + 1) * sizeof (uint64_t), -1ULL, tx));
+       }
+
+out:
+       /* If there are no entries, there should be no bytes. */
+       ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 ||
+           (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) ||
+           bpo->bpo_phys->bpo_bytes == 0);
+
+       mutex_exit(&bpo->bpo_lock);
+       return (err);
+}
+
+/*
+ * Iterate and remove the entries.  If func returns nonzero, iteration
+ * will stop and that entry will not be removed.
+ */
+int
+bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
+{
+       return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
+}
+
+/*
+ * Iterate the entries.  If func returns nonzero, iteration will stop.
+ */
+int
+bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
+{
+       return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
+}
+
+void
+bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
+{
+       bpobj_t subbpo;
+       uint64_t used, comp, uncomp;
+
+       ASSERT(bpo->bpo_havesubobj);
+       ASSERT(bpo->bpo_havecomp);
+
+       VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
+       VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
+       bpobj_close(&subbpo);
+
+       if (used == 0) {
+               /* No point in having an empty subobj. */
+               bpobj_free(bpo->bpo_os, subobj, tx);
+               return;
+       }
+
+       dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+       if (bpo->bpo_phys->bpo_subobjs == 0) {
+               bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
+                   DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
+       }
+
+       mutex_enter(&bpo->bpo_lock);
+       dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+           bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+           sizeof (subobj), &subobj, tx);
+       bpo->bpo_phys->bpo_num_subobjs++;
+       bpo->bpo_phys->bpo_bytes += used;
+       bpo->bpo_phys->bpo_comp += comp;
+       bpo->bpo_phys->bpo_uncomp += uncomp;
+       mutex_exit(&bpo->bpo_lock);
+}
+
+void
+bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
+{
+       blkptr_t stored_bp = *bp;
+       uint64_t offset;
+       int blkoff;
+       blkptr_t *bparray;
+
+       ASSERT(!BP_IS_HOLE(bp));
+
+       /* We never need the fill count. */
+       stored_bp.blk_fill = 0;
+
+       /* The bpobj will compress better if we can leave off the checksum */
+       if (!BP_GET_DEDUP(bp))
+               bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
+
+       mutex_enter(&bpo->bpo_lock);
+
+       offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
+       blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);
+
+       if (bpo->bpo_cached_dbuf == NULL ||
+           offset < bpo->bpo_cached_dbuf->db_offset ||
+           offset >= bpo->bpo_cached_dbuf->db_offset +
+           bpo->bpo_cached_dbuf->db_size) {
+               if (bpo->bpo_cached_dbuf)
+                       dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
+               VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
+                   offset, bpo, &bpo->bpo_cached_dbuf, 0));
+       }
+
+       dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
+       bparray = bpo->bpo_cached_dbuf->db_data;
+       bparray[blkoff] = stored_bp;
+
+       dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+       bpo->bpo_phys->bpo_num_blkptrs++;
+       bpo->bpo_phys->bpo_bytes +=
+           bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
+       if (bpo->bpo_havecomp) {
+               bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
+               bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
+       }
+       mutex_exit(&bpo->bpo_lock);
+}
+
+struct space_range_arg {
+       spa_t *spa;
+       uint64_t mintxg;
+       uint64_t maxtxg;
+       uint64_t used;
+       uint64_t comp;
+       uint64_t uncomp;
+};
+
+/* ARGSUSED */
+static int
+space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+       struct space_range_arg *sra = arg;
+
+       if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
+               sra->used += bp_get_dsize_sync(sra->spa, bp);
+               sra->comp += BP_GET_PSIZE(bp);
+               sra->uncomp += BP_GET_UCSIZE(bp);
+       }
+       return (0);
+}
+
+int
+bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+       mutex_enter(&bpo->bpo_lock);
+
+       *usedp = bpo->bpo_phys->bpo_bytes;
+       if (bpo->bpo_havecomp) {
+               *compp = bpo->bpo_phys->bpo_comp;
+               *uncompp = bpo->bpo_phys->bpo_uncomp;
+               mutex_exit(&bpo->bpo_lock);
+               return (0);
+       } else {
+               mutex_exit(&bpo->bpo_lock);
+               return (bpobj_space_range(bpo, 0, UINT64_MAX,
+                   usedp, compp, uncompp));
+       }
+}
+
+/*
+ * Return the amount of space in the bpobj which is:
+ * mintxg < blk_birth <= maxtxg
+ */
+int
+bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+       struct space_range_arg sra = { 0 };
+       int err;
+
+       /*
+        * As an optimization, if they want the whole txg range, just
+        * get bpo_bytes rather than iterating over the bps.
+        */
+       if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
+               return (bpobj_space(bpo, usedp, compp, uncompp));
+
+       sra.spa = dmu_objset_spa(bpo->bpo_os);
+       sra.mintxg = mintxg;
+       sra.maxtxg = maxtxg;
+
+       err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
+       *usedp = sra.used;
+       *compp = sra.comp;
+       *uncompp = sra.uncomp;
+       return (err);
+}
index 1b6f242..42ae439 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu_zfetch.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
 
 static void dbuf_destroy(dmu_buf_impl_t *db);
 static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
-static arc_done_func_t dbuf_write_ready;
-static arc_done_func_t dbuf_write_done;
-static zio_done_func_t dbuf_skip_write_ready;
-static zio_done_func_t dbuf_skip_write_done;
 
 /*
  * Global data structures and functions for the dbuf cache.
@@ -109,7 +106,7 @@ dmu_buf_impl_t *
 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
 {
        dbuf_hash_table_t *h = &dbuf_hash_table;
-       objset_impl_t *os = dn->dn_objset;
+       objset_t *os = dn->dn_objset;
        uint64_t obj = dn->dn_object;
        uint64_t hv = DBUF_HASH(os, obj, level, blkid);
        uint64_t idx = hv & h->hash_table_mask;
@@ -140,7 +137,7 @@ static dmu_buf_impl_t *
 dbuf_hash_insert(dmu_buf_impl_t *db)
 {
        dbuf_hash_table_t *h = &dbuf_hash_table;
-       objset_impl_t *os = db->db_objset;
+       objset_t *os = db->db_objset;
        uint64_t obj = db->db.db_object;
        int level = db->db_level;
        uint64_t blkid = db->db_blkid;
@@ -285,6 +282,7 @@ static void
 dbuf_verify(dmu_buf_impl_t *db)
 {
        dnode_t *dn = db->db_dnode;
+       dbuf_dirty_record_t *dr;
 
        ASSERT(MUTEX_HELD(&db->db_mtx));
 
@@ -299,24 +297,34 @@ dbuf_verify(dmu_buf_impl_t *db)
                ASSERT3U(db->db.db_object, ==, dn->dn_object);
                ASSERT3P(db->db_objset, ==, dn->dn_objset);
                ASSERT3U(db->db_level, <, dn->dn_nlevels);
-               ASSERT(db->db_blkid == DB_BONUS_BLKID ||
-                   list_head(&dn->dn_dbufs));
+               ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid ==
+                   DMU_SPILL_BLKID || list_head(&dn->dn_dbufs));
        }
-       if (db->db_blkid == DB_BONUS_BLKID) {
+       if (db->db_blkid == DMU_BONUS_BLKID) {
+               ASSERT(dn != NULL);
+               ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
+               ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
+       } else if (db->db_blkid == DMU_SPILL_BLKID) {
                ASSERT(dn != NULL);
                ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
-               ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
+               ASSERT3U(db->db.db_offset, ==, 0);
        } else {
                ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
        }
 
+       for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
+               ASSERT(dr->dr_dbuf == db);
+
+       for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
+               ASSERT(dr->dr_dbuf == db);
+
        /*
         * We can't assert that db_size matches dn_datablksz because it
         * can be momentarily different when another thread is doing
         * dnode_set_blksz().
         */
        if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
-               dbuf_dirty_record_t *dr = db->db_data_pending;
+               dr = db->db_data_pending;
                /*
                 * It should only be modified in syncing context, so
                 * make sure we only have one copy of the data.
@@ -333,8 +341,9 @@ dbuf_verify(dmu_buf_impl_t *db)
                                ASSERT(db->db_parent == NULL);
                        else
                                ASSERT(db->db_parent != NULL);
-                       ASSERT3P(db->db_blkptr, ==,
-                           &dn->dn_phys->dn_blkptr[db->db_blkid]);
+                       if (db->db_blkid != DMU_SPILL_BLKID)
+                               ASSERT3P(db->db_blkptr, ==,
+                                   &dn->dn_phys->dn_blkptr[db->db_blkid]);
                } else {
                        /* db is pointed to by an indirect block */
                        int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
@@ -354,7 +363,8 @@ dbuf_verify(dmu_buf_impl_t *db)
                }
        }
        if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
-           db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
+           (db->db_buf == NULL || db->db_buf->b_data) &&
+           db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
            db->db_state != DB_FILL && !dn->dn_free_txg) {
                /*
                 * If the blkptr isn't set but they have nonzero data,
@@ -403,6 +413,29 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
        }
 }
 
+/*
+ * Loan out an arc_buf for read.  Return the loaned arc_buf.
+ */
+arc_buf_t *
+dbuf_loan_arcbuf(dmu_buf_impl_t *db)
+{
+       arc_buf_t *abuf;
+
+       mutex_enter(&db->db_mtx);
+       if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
+               int blksz = db->db.db_size;
+               mutex_exit(&db->db_mtx);
+               abuf = arc_loan_buf(db->db_dnode->dn_objset->os_spa, blksz);
+               bcopy(db->db.db_data, abuf->b_data, blksz);
+       } else {
+               abuf = db->db_buf;
+               arc_loan_inuse_buf(abuf, db);
+               dbuf_set_data(db, NULL);
+               mutex_exit(&db->db_mtx);
+       }
+       return (abuf);
+}
+
 uint64_t
 dbuf_whichblock(dnode_t *dn, uint64_t offset)
 {
@@ -439,14 +472,13 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
                dbuf_set_data(db, buf);
                db->db_state = DB_CACHED;
        } else {
-               ASSERT(db->db_blkid != DB_BONUS_BLKID);
+               ASSERT(db->db_blkid != DMU_BONUS_BLKID);
                ASSERT3P(db->db_buf, ==, NULL);
                VERIFY(arc_buf_remove_ref(buf, db) == 1);
                db->db_state = DB_UNCACHED;
        }
        cv_broadcast(&db->db_changed);
-       mutex_exit(&db->db_mtx);
-       dbuf_rele(db, NULL);
+       dbuf_rele_and_unlock(db, NULL);
 }
 
 static void
@@ -464,7 +496,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
        ASSERT(db->db_state == DB_UNCACHED);
        ASSERT(db->db_buf == NULL);
 
-       if (db->db_blkid == DB_BONUS_BLKID) {
+       if (db->db_blkid == DMU_BONUS_BLKID) {
                int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
 
                ASSERT3U(bonuslen, <=, db->db.db_size);
@@ -505,11 +537,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
        if (DBUF_IS_L2CACHEABLE(db))
                aflags |= ARC_L2CACHE;
 
-       zb.zb_objset = db->db_objset->os_dsl_dataset ?
-           db->db_objset->os_dsl_dataset->ds_object : 0;
-       zb.zb_object = db->db.db_object;
-       zb.zb_level = db->db_level;
-       zb.zb_blkid = db->db_blkid;
+       SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
+           db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+           db->db.db_object, db->db_level, db->db_blkid);
 
        dbuf_add_ref(db, NULL);
        /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
@@ -519,7 +549,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
        else
                pbuf = db->db_objset->os_phys_buf;
 
-       (void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
+       (void) dsl_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
            dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
            (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
            &aflags, &zb);
@@ -546,7 +576,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
        if ((flags & DB_RF_HAVESTRUCT) == 0)
                rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
 
-       prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
+       prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
            (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL &&
            DBUF_IS_CACHEABLE(db);
 
@@ -606,7 +636,7 @@ static void
 dbuf_noread(dmu_buf_impl_t *db)
 {
        ASSERT(!refcount_is_zero(&db->db_holds));
-       ASSERT(db->db_blkid != DB_BONUS_BLKID);
+       ASSERT(db->db_blkid != DMU_BONUS_BLKID);
        mutex_enter(&db->db_mtx);
        while (db->db_state == DB_READ || db->db_state == DB_FILL)
                cv_wait(&db->db_changed, &db->db_mtx);
@@ -651,7 +681,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 
        if (dr == NULL ||
            (dr->dt.dl.dr_data !=
-           ((db->db_blkid  == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
+           ((db->db_blkid  == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
                return;
 
        /*
@@ -662,7 +692,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
         *      just null out the current db_data pointer.
         */
        ASSERT(dr->dr_txg >= txg - 2);
-       if (db->db_blkid == DB_BONUS_BLKID) {
+       if (db->db_blkid == DMU_BONUS_BLKID) {
                /* Note that the data bufs here are zio_bufs */
                dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
                arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
@@ -682,23 +712,23 @@ void
 dbuf_unoverride(dbuf_dirty_record_t *dr)
 {
        dmu_buf_impl_t *db = dr->dr_dbuf;
+       blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
        uint64_t txg = dr->dr_txg;
 
        ASSERT(MUTEX_HELD(&db->db_mtx));
        ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
        ASSERT(db->db_level == 0);
 
-       if (db->db_blkid == DB_BONUS_BLKID ||
+       if (db->db_blkid == DMU_BONUS_BLKID ||
            dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
                return;
 
+       ASSERT(db->db_data_pending != dr);
+
        /* free this block */
-       if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
-               /* XXX can get silent EIO here */
-               (void) dsl_free(NULL,
-                   spa_get_dsl(db->db_dnode->dn_objset->os_spa),
-                   txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
-       }
+       if (!BP_IS_HOLE(bp))
+               zio_free(db->db_dnode->dn_objset->os_spa, txg, bp);
+
        dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
        /*
         * Release the already-written buffer, so we leave it in
@@ -727,7 +757,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
        uint64_t first_l1 = start >> epbs;
        uint64_t last_l1 = end >> epbs;
 
-       if (end > dn->dn_maxblkid) {
+       if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
                end = dn->dn_maxblkid;
                last_l1 = end >> epbs;
        }
@@ -735,7 +765,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
        mutex_enter(&dn->dn_dbufs_mtx);
        for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
                db_next = list_next(&dn->dn_dbufs, db);
-               ASSERT(db->db_blkid != DB_BONUS_BLKID);
+               ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
                if (db->db_level == 1 &&
                    db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
@@ -791,7 +821,8 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
                                 * size to reflect that this buffer may
                                 * contain new data when we sync.
                                 */
-                               if (db->db_blkid > dn->dn_maxblkid)
+                               if (db->db_blkid != DMU_SPILL_BLKID &&
+                                   db->db_blkid > dn->dn_maxblkid)
                                        dn->dn_maxblkid = db->db_blkid;
                                dbuf_unoverride(dr);
                        } else {
@@ -837,7 +868,7 @@ dbuf_block_freeable(dmu_buf_impl_t *db)
        /* If we don't exist or are in a snapshot, we can't be freed */
        if (birth_txg)
                return (ds == NULL ||
-                   dsl_dataset_block_freeable(ds, birth_txg));
+                   dsl_dataset_block_freeable(ds, db->db_blkptr, birth_txg));
        else
                return (FALSE);
 }
@@ -849,7 +880,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
        int osize = db->db.db_size;
        arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 
-       ASSERT(db->db_blkid != DB_BONUS_BLKID);
+       ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
        /* XXX does *this* func really need the lock? */
        ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
@@ -890,11 +921,31 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
        dnode_willuse_space(db->db_dnode, size-osize, tx);
 }
 
+void
+dbuf_release_bp(dmu_buf_impl_t *db)
+{
+       objset_t *os = db->db_dnode->dn_objset;
+       zbookmark_t zb;
+
+       ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
+       ASSERT(arc_released(os->os_phys_buf) ||
+           list_link_active(&os->os_dsl_dataset->ds_synced_link));
+       ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
+
+       zb.zb_objset = os->os_dsl_dataset ?
+           os->os_dsl_dataset->ds_object : 0;
+       zb.zb_object = db->db.db_object;
+       zb.zb_level = db->db_level;
+       zb.zb_blkid = db->db_blkid;
+       (void) arc_release_bp(db->db_buf, db,
+           db->db_blkptr, os->os_spa, &zb);
+}
+
 dbuf_dirty_record_t *
 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
        dnode_t *dn = db->db_dnode;
-       objset_impl_t *os = dn->dn_objset;
+       objset_t *os = dn->dn_objset;
        dbuf_dirty_record_t **drp, *dr;
        int drop_struct_lock = FALSE;
        boolean_t do_free_accounting = B_FALSE;
@@ -946,6 +997,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        }
        mutex_exit(&dn->dn_mtx);
 
+       if (db->db_blkid == DMU_SPILL_BLKID)
+               dn->dn_have_spill = B_TRUE;
+
        /*
         * If this buffer is already dirty, we're done.
         */
@@ -955,13 +1009,14 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
                drp = &dr->dr_next;
        if (dr && dr->dr_txg == tx->tx_txg) {
-               if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
+               if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
                        /*
                         * If this buffer has already been written out,
                         * we now need to reset its state.
                         */
                        dbuf_unoverride(dr);
-                       if (db->db.db_object != DMU_META_DNODE_OBJECT)
+                       if (db->db.db_object != DMU_META_DNODE_OBJECT &&
+                           db->db_state != DB_NOFILL)
                                arc_buf_thaw(db->db_buf);
                }
                mutex_exit(&db->db_mtx);
@@ -995,12 +1050,12 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 
        dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
-       if (db->db_blkid != DB_BONUS_BLKID) {
+       if (db->db_blkid != DMU_BONUS_BLKID) {
                /*
                 * Update the accounting.
                 * Note: we delay "free accounting" until after we drop
                 * the db_mtx.  This keeps us from grabbing other locks
-                * (and possibly deadlocking) in bp_get_dasize() while
+                * (and possibly deadlocking) in bp_get_dsize() while
                 * also holding the db_mtx.
                 */
                dnode_willuse_space(dn, db->db.db_size, tx);
@@ -1017,7 +1072,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                void *data_old = db->db_buf;
 
                if (db->db_state != DB_NOFILL) {
-                       if (db->db_blkid == DB_BONUS_BLKID) {
+                       if (db->db_blkid == DMU_BONUS_BLKID) {
                                dbuf_fix_old_data(db, tx->tx_txg);
                                data_old = db->db.db_data;
                        } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
@@ -1053,7 +1108,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
         * and dbuf_dirty.  We win, as though the dbuf_noread() had
         * happened after the free.
         */
-       if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
+       if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
+           db->db_blkid != DMU_SPILL_BLKID) {
                mutex_enter(&dn->dn_mtx);
                dnode_clear_range(dn, db->db_blkid, 1, tx);
                mutex_exit(&dn->dn_mtx);
@@ -1069,7 +1125,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 
        mutex_exit(&db->db_mtx);
 
-       if (db->db_blkid == DB_BONUS_BLKID) {
+       if (db->db_blkid == DMU_BONUS_BLKID ||
+           db->db_blkid == DMU_SPILL_BLKID) {
                mutex_enter(&dn->dn_mtx);
                ASSERT(!list_link_active(&dr->dr_dirty_node));
                list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
@@ -1079,7 +1136,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        } else if (do_free_accounting) {
                blkptr_t *bp = db->db_blkptr;
                int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
-                   bp_get_dasize(os->os_spa, bp) : db->db.db_size;
+                   bp_get_dsize(os->os_spa, bp) : db->db.db_size;
                /*
                 * This is only a guess -- if the dbuf is dirty
                 * in a previous txg, we don't know how much
@@ -1111,6 +1168,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 
                        parent = dbuf_hold_level(dn, db->db_level+1,
                            db->db_blkid >> epbs, FTAG);
+                       ASSERT(parent != NULL);
                        parent_held = TRUE;
                }
                if (drop_struct_lock)
@@ -1157,10 +1215,9 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        dbuf_dirty_record_t *dr, **drp;
 
        ASSERT(txg != 0);
-       ASSERT(db->db_blkid != DB_BONUS_BLKID);
+       ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
        mutex_enter(&db->db_mtx);
-
        /*
         * If this buffer is not dirty, we're done.
         */
@@ -1172,6 +1229,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                return (0);
        }
        ASSERT(dr->dr_txg == txg);
+       ASSERT(dr->dr_dbuf == db);
 
        /*
         * If this buffer is currently held, we cannot undirty
@@ -1231,7 +1289,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
                arc_buf_t *buf = db->db_buf;
 
-               ASSERT(arc_released(buf));
+               ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
                dbuf_set_data(db, NULL);
                VERIFY(arc_buf_remove_ref(buf, db) == 1);
                dbuf_evict(db);
@@ -1272,7 +1330,7 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
        dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
-       ASSERT(db->db_blkid != DB_BONUS_BLKID);
+       ASSERT(db->db_blkid != DMU_BONUS_BLKID);
        ASSERT(tx->tx_txg != 0);
        ASSERT(db->db_level == 0);
        ASSERT(!refcount_is_zero(&db->db_holds));
@@ -1294,7 +1352,7 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
 
        if (db->db_state == DB_FILL) {
                if (db->db_level == 0 && db->db_freed_in_flight) {
-                       ASSERT(db->db_blkid != DB_BONUS_BLKID);
+                       ASSERT(db->db_blkid != DMU_BONUS_BLKID);
                        /* we were freed while filling */
                        /* XXX dbuf_undirty? */
                        bzero(db->db.db_data, db->db.db_size);
@@ -1315,7 +1373,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
 {
        ASSERT(!refcount_is_zero(&db->db_holds));
        ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT);
-       ASSERT(db->db_blkid != DB_BONUS_BLKID);
+       ASSERT(db->db_blkid != DMU_BONUS_BLKID);
        ASSERT(db->db_level == 0);
        ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
        ASSERT(buf != NULL);
@@ -1338,9 +1396,11 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
                (void) dbuf_dirty(db, tx);
                bcopy(buf->b_data, db->db.db_data, db->db.db_size);
                VERIFY(arc_buf_remove_ref(buf, db) == 1);
+               xuio_stat_wbuf_copied();
                return;
        }
 
+       xuio_stat_wbuf_nocopy();
        if (db->db_state == DB_CACHED) {
                dbuf_dirty_record_t *dr = db->db_last_dirty;
 
@@ -1396,7 +1456,7 @@ dbuf_clear(dmu_buf_impl_t *db)
 
        if (db->db_state == DB_CACHED) {
                ASSERT(db->db.db_data != NULL);
-               if (db->db_blkid == DB_BONUS_BLKID) {
+               if (db->db_blkid == DMU_BONUS_BLKID) {
                        zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
                        arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
                }
@@ -1410,7 +1470,7 @@ dbuf_clear(dmu_buf_impl_t *db)
        db->db_state = DB_EVICTING;
        db->db_blkptr = NULL;
 
-       if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
+       if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
                list_remove(&dn->dn_dbufs, db);
                dnode_rele(dn, db);
                db->db_dnode = NULL;
@@ -1439,7 +1499,20 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
        *parentp = NULL;
        *bpp = NULL;
 
-       ASSERT(blkid != DB_BONUS_BLKID);
+       ASSERT(blkid != DMU_BONUS_BLKID);
+
+       if (blkid == DMU_SPILL_BLKID) {
+               mutex_enter(&dn->dn_mtx);
+               if (dn->dn_have_spill &&
+                   (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
+                       *bpp = &dn->dn_phys->dn_spill;
+               else
+                       *bpp = NULL;
+               dbuf_add_ref(dn->dn_dbuf, NULL);
+               *parentp = dn->dn_dbuf;
+               mutex_exit(&dn->dn_mtx);
+               return (0);
+       }
 
        if (dn->dn_phys->dn_nlevels == 0)
                nlevels = 1;
@@ -1488,7 +1561,7 @@ static dmu_buf_impl_t *
 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
     dmu_buf_impl_t *parent, blkptr_t *blkptr)
 {
-       objset_impl_t *os = dn->dn_objset;
+       objset_t *os = dn->dn_objset;
        dmu_buf_impl_t *db, *odb;
 
        ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
@@ -1512,16 +1585,20 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
        db->db_immediate_evict = 0;
        db->db_freed_in_flight = 0;
 
-       if (blkid == DB_BONUS_BLKID) {
+       if (blkid == DMU_BONUS_BLKID) {
                ASSERT3P(parent, ==, dn->dn_dbuf);
                db->db.db_size = DN_MAX_BONUSLEN -
                    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
                ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
-               db->db.db_offset = DB_BONUS_BLKID;
+               db->db.db_offset = DMU_BONUS_BLKID;
                db->db_state = DB_UNCACHED;
                /* the bonus dbuf is not placed in the hash table */
                arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
                return (db);
+       } else if (blkid == DMU_SPILL_BLKID) {
+               db->db.db_size = (blkptr != NULL) ?
+                   BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
+               db->db.db_offset = 0;
        } else {
                int blocksize =
                    db->db_level ? 1<<dn->dn_indblkshift :  dn->dn_datablksz;
@@ -1589,7 +1666,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
 {
        ASSERT(refcount_is_zero(&db->db_holds));
 
-       if (db->db_blkid != DB_BONUS_BLKID) {
+       if (db->db_blkid != DMU_BONUS_BLKID) {
                /*
                 * If this dbuf is still on the dn_dbufs list,
                 * remove it from that list.
@@ -1625,7 +1702,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
        dmu_buf_impl_t *db = NULL;
        blkptr_t *bp = NULL;
 
-       ASSERT(blkid != DB_BONUS_BLKID);
+       ASSERT(blkid != DMU_BONUS_BLKID);
        ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 
        if (dnode_block_freed(dn, blkid))
@@ -1648,22 +1725,23 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
 
        if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
                if (bp && !BP_IS_HOLE(bp)) {
+                       int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
+                           ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
                        arc_buf_t *pbuf;
+                       dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
                        uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
                        zbookmark_t zb;
-                       zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
-                           dn->dn_objset->os_dsl_dataset->ds_object : 0;
-                       zb.zb_object = dn->dn_object;
-                       zb.zb_level = 0;
-                       zb.zb_blkid = blkid;
+
+                       SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+                           dn->dn_object, 0, blkid);
 
                        if (db)
                                pbuf = db->db_buf;
                        else
                                pbuf = dn->dn_objset->os_phys_buf;
 
-                       (void) arc_read(NULL, dn->dn_objset->os_spa,
-                           bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+                       (void) dsl_read(NULL, dn->dn_objset->os_spa,
+                           bp, pbuf, NULL, NULL, priority,
                            ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
                            &aflags, &zb);
                }
@@ -1682,7 +1760,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
 {
        dmu_buf_impl_t *db, *parent = NULL;
 
-       ASSERT(blkid != DB_BONUS_BLKID);
+       ASSERT(blkid != DMU_BONUS_BLKID);
        ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
        ASSERT3U(dn->dn_nlevels, >, level);
 
@@ -1731,7 +1809,7 @@ top:
         * still referencing it from db_data, we need to make a copy
         * of it in case we decide we want to dirty it again in this txg.
         */
-       if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
+       if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
            dn->dn_object != DMU_META_DNODE_OBJECT &&
            db->db_state == DB_CACHED && db->db_data_pending) {
                dbuf_dirty_record_t *dr = db->db_data_pending;
@@ -1786,7 +1864,33 @@ dbuf_create_bonus(dnode_t *dn)
        ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 
        ASSERT(dn->dn_bonus == NULL);
-       dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
+       dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
+}
+
+int
+dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
+{
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+       if (db->db_blkid != DMU_SPILL_BLKID)
+               return (ENOTSUP);
+       if (blksz == 0)
+               blksz = SPA_MINBLOCKSIZE;
+       if (blksz > SPA_MAXBLOCKSIZE)
+               blksz = SPA_MAXBLOCKSIZE;
+       else
+               blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
+
+       rw_enter(&db->db_dnode->dn_struct_rwlock, RW_WRITER);
+       dbuf_new_size(db, blksz, tx);
+       rw_exit(&db->db_dnode->dn_struct_rwlock);
+
+       return (0);
+}
+
+void
+dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
+{
+       dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
 }
 
 #pragma weak dmu_buf_add_ref = dbuf_add_ref
@@ -1801,9 +1905,20 @@ dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
 void
 dbuf_rele(dmu_buf_impl_t *db, void *tag)
 {
+       mutex_enter(&db->db_mtx);
+       dbuf_rele_and_unlock(db, tag);
+}
+
+/*
+ * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
+ * db_dirtycnt and db_holds to be updated atomically.
+ */
+void
+dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
+{
        int64_t holds;
 
-       mutex_enter(&db->db_mtx);
+       ASSERT(MUTEX_HELD(&db->db_mtx));
        DBUF_VERIFY(db);
 
        holds = refcount_remove(&db->db_holds, tag);
@@ -1821,7 +1936,7 @@ dbuf_rele(dmu_buf_impl_t *db, void *tag)
                dbuf_evict_user(db);
 
        if (holds == 0) {
-               if (db->db_blkid == DB_BONUS_BLKID) {
+               if (db->db_blkid == DMU_BONUS_BLKID) {
                        mutex_exit(&db->db_mtx);
                        dnode_rele(db->db_dnode, db);
                } else if (db->db_buf == NULL) {
@@ -1920,7 +2035,7 @@ dmu_buf_freeable(dmu_buf_t *dbuf)
 
        if (db->db_blkptr)
                res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
-                   db->db_blkptr->blk_birth);
+                   db->db_blkptr, db->db_blkptr->blk_birth);
 
        return (res);
 }
@@ -1934,6 +2049,11 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
        if (db->db_blkptr != NULL)
                return;
 
+       if (db->db_blkid == DMU_SPILL_BLKID) {
+               db->db_blkptr = &dn->dn_phys->dn_spill;
+               BP_ZERO(db->db_blkptr);
+               return;
+       }
        if (db->db_level == dn->dn_phys->dn_nlevels-1) {
                /*
                 * This buffer was allocated at a time when there was
@@ -2011,7 +2131,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
        arc_buf_t **datap = &dr->dt.dl.dr_data;
        dmu_buf_impl_t *db = dr->dr_dbuf;
        dnode_t *dn = db->db_dnode;
-       objset_impl_t *os = dn->dn_objset;
+       objset_t *os = dn->dn_objset;
        uint64_t txg = tx->tx_txg;
 
        ASSERT(dmu_tx_is_syncing(tx));
@@ -2034,13 +2154,19 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
        }
        DBUF_VERIFY(db);
 
+       if (db->db_blkid == DMU_SPILL_BLKID) {
+               mutex_enter(&dn->dn_mtx);
+               dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
+               mutex_exit(&dn->dn_mtx);
+       }
+
        /*
         * If this is a bonus buffer, simply copy the bonus data into the
         * dnode.  It will be written out when the dnode is synced (and it
         * will be synced, since it must have been dirty for dbuf_sync to
         * be called).
         */
-       if (db->db_blkid == DB_BONUS_BLKID) {
+       if (db->db_blkid == DMU_BONUS_BLKID) {
                dbuf_dirty_record_t **drp;
 
                ASSERT(*datap != NULL);
@@ -2056,12 +2182,12 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
                while (*drp != dr)
                        drp = &(*drp)->dr_next;
                ASSERT(dr->dr_next == NULL);
+               ASSERT(dr->dr_dbuf == db);
                *drp = dr->dr_next;
                kmem_free(dr, sizeof (dbuf_dirty_record_t));
                ASSERT(db->db_dirtycnt > 0);
                db->db_dirtycnt -= 1;
-               mutex_exit(&db->db_mtx);
-               dbuf_rele(db, (void *)(uintptr_t)txg);
+               dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
                return;
        }
 
@@ -2083,44 +2209,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
                ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
        }
 
-       /*
-        * If this dbuf has already been written out via an immediate write,
-        * just complete the write by copying over the new block pointer and
-        * updating the accounting via the write-completion functions.
-        */
-       if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
-               zio_t zio_fake;
-
-               zio_fake.io_private = &db;
-               zio_fake.io_error = 0;
-               zio_fake.io_bp = db->db_blkptr;
-               zio_fake.io_bp_orig = *db->db_blkptr;
-               zio_fake.io_txg = txg;
-               zio_fake.io_flags = 0;
-
-               *db->db_blkptr = dr->dt.dl.dr_overridden_by;
-               dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
-               db->db_data_pending = dr;
-               dr->dr_zio = &zio_fake;
-               mutex_exit(&db->db_mtx);
-
-               ASSERT(!DVA_EQUAL(BP_IDENTITY(zio_fake.io_bp),
-                   BP_IDENTITY(&zio_fake.io_bp_orig)) ||
-                   BP_IS_HOLE(zio_fake.io_bp));
-
-               if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
-                       (void) dsl_dataset_block_kill(os->os_dsl_dataset,
-                           &zio_fake.io_bp_orig, dn->dn_zio, tx);
-
-               dbuf_write_ready(&zio_fake, db->db_buf, db);
-               dbuf_write_done(&zio_fake, db->db_buf, db);
-
-               return;
-       }
-
        if (db->db_state != DB_NOFILL &&
            dn->dn_object != DMU_META_DNODE_OBJECT &&
            refcount_count(&db->db_holds) > 1 &&
+           dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
            *datap == db->db_buf) {
                /*
                 * If this buffer is currently "in use" (i.e., there
@@ -2177,141 +2269,51 @@ dbuf_sync_list(list_t *list, dmu_tx_t *tx)
        }
 }
 
-static void
-dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
-{
-       dmu_buf_impl_t *db = dr->dr_dbuf;
-       dnode_t *dn = db->db_dnode;
-       objset_impl_t *os = dn->dn_objset;
-       dmu_buf_impl_t *parent = db->db_parent;
-       uint64_t txg = tx->tx_txg;
-       zbookmark_t zb;
-       writeprops_t wp = { 0 };
-       zio_t *zio;
-
-       if (!BP_IS_HOLE(db->db_blkptr) &&
-           (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) {
-               /*
-                * Private object buffers are released here rather
-                * than in dbuf_dirty() since they are only modified
-                * in the syncing context and we don't want the
-                * overhead of making multiple copies of the data.
-                */
-               arc_release(data, db);
-       } else if (db->db_state != DB_NOFILL) {
-               ASSERT(arc_released(data));
-               /* XXX why do we need to thaw here? */
-               arc_buf_thaw(data);
-       }
-
-       if (parent != dn->dn_dbuf) {
-               ASSERT(parent && parent->db_data_pending);
-               ASSERT(db->db_level == parent->db_level-1);
-               ASSERT(arc_released(parent->db_buf));
-               zio = parent->db_data_pending->dr_zio;
-       } else {
-               ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
-               ASSERT3P(db->db_blkptr, ==,
-                   &dn->dn_phys->dn_blkptr[db->db_blkid]);
-               zio = dn->dn_zio;
-       }
-
-       ASSERT(db->db_level == 0 || data == db->db_buf);
-       ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
-       ASSERT(zio);
-
-       zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
-       zb.zb_object = db->db.db_object;
-       zb.zb_level = db->db_level;
-       zb.zb_blkid = db->db_blkid;
-
-       wp.wp_type = dn->dn_type;
-       wp.wp_level = db->db_level;
-       wp.wp_copies = os->os_copies;
-       wp.wp_dncompress = dn->dn_compress;
-       wp.wp_oscompress = os->os_compress;
-       wp.wp_dnchecksum = dn->dn_checksum;
-       wp.wp_oschecksum = os->os_checksum;
-
-       if (BP_IS_OLDER(db->db_blkptr, txg))
-               (void) dsl_dataset_block_kill(
-                   os->os_dsl_dataset, db->db_blkptr, zio, tx);
-
-       if (db->db_state == DB_NOFILL) {
-               zio_prop_t zp = { 0 };
-
-               write_policy(os->os_spa, &wp, &zp);
-               dr->dr_zio = zio_write(zio, os->os_spa,
-                   txg, db->db_blkptr, NULL,
-                   db->db.db_size, &zp, dbuf_skip_write_ready,
-                   dbuf_skip_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
-                   ZIO_FLAG_MUSTSUCCEED, &zb);
-       } else {
-               dr->dr_zio = arc_write(zio, os->os_spa, &wp,
-                   DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr,
-                   data, dbuf_write_ready, dbuf_write_done, db,
-                   ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
-       }
-}
-
-/* wrapper function for dbuf_write_ready bypassing ARC */
-static void
-dbuf_skip_write_ready(zio_t *zio)
-{
-       blkptr_t *bp = zio->io_bp;
-
-       if (!BP_IS_GANG(bp))
-               zio_skip_write(zio);
-
-       dbuf_write_ready(zio, NULL, zio->io_private);
-}
-
-/* wrapper function for dbuf_write_done bypassing ARC */
-static void
-dbuf_skip_write_done(zio_t *zio)
-{
-       dbuf_write_done(zio, NULL, zio->io_private);
-}
-
 /* ARGSUSED */
 static void
 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
        dmu_buf_impl_t *db = vdb;
-       dnode_t *dn = db->db_dnode;
-       objset_impl_t *os = dn->dn_objset;
        blkptr_t *bp = zio->io_bp;
        blkptr_t *bp_orig = &zio->io_bp_orig;
+       dnode_t *dn = db->db_dnode;
+       spa_t *spa = zio->io_spa;
+       int64_t delta;
        uint64_t fill = 0;
-       int old_size, new_size, i;
+       int i;
 
        ASSERT(db->db_blkptr == bp);
 
-       dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");
-
-       old_size = bp_get_dasize(os->os_spa, bp_orig);
-       new_size = bp_get_dasize(os->os_spa, bp);
-
-       dnode_diduse_space(dn, new_size - old_size);
+       delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
+       dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
+       zio->io_prev_space_delta = delta;
 
        if (BP_IS_HOLE(bp)) {
-               dsl_dataset_t *ds = os->os_dsl_dataset;
-               dmu_tx_t *tx = os->os_synctx;
-
-               if (bp_orig->blk_birth == tx->tx_txg)
-                       (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
-               ASSERT3U(bp->blk_fill, ==, 0);
+               ASSERT(bp->blk_fill == 0);
                return;
        }
 
-       ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
+       ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
+           BP_GET_TYPE(bp) == dn->dn_type) ||
+           (db->db_blkid == DMU_SPILL_BLKID &&
+           BP_GET_TYPE(bp) == dn->dn_bonustype));
        ASSERT(BP_GET_LEVEL(bp) == db->db_level);
 
        mutex_enter(&db->db_mtx);
 
+#ifdef ZFS_DEBUG
+       if (db->db_blkid == DMU_SPILL_BLKID) {
+               dnode_t *dn = db->db_dnode;
+               ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
+               ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
+                   db->db_blkptr == &dn->dn_phys->dn_spill);
+       }
+#endif
+
        if (db->db_level == 0) {
                mutex_enter(&dn->dn_mtx);
-               if (db->db_blkid > dn->dn_phys->dn_maxblkid)
+               if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
+                   db->db_blkid != DMU_SPILL_BLKID)
                        dn->dn_phys->dn_maxblkid = db->db_blkid;
                mutex_exit(&dn->dn_mtx);
 
@@ -2331,9 +2333,6 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
                for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
                        if (BP_IS_HOLE(ibp))
                                continue;
-                       ASSERT3U(BP_GET_LSIZE(ibp), ==,
-                           db->db_level == 1 ? dn->dn_datablksz :
-                           (1<<dn->dn_phys->dn_indblkshift));
                        fill += ibp->blk_fill;
                }
        }
@@ -2341,17 +2340,6 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
        bp->blk_fill = fill;
 
        mutex_exit(&db->db_mtx);
-
-       if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
-               ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
-       } else {
-               dsl_dataset_t *ds = os->os_dsl_dataset;
-               dmu_tx_t *tx = os->os_synctx;
-
-               if (bp_orig->blk_birth == tx->tx_txg)
-                       (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
-               dsl_dataset_block_born(ds, bp, tx);
-       }
 }
 
 /* ARGSUSED */
@@ -2359,37 +2347,59 @@ static void
 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
        dmu_buf_impl_t *db = vdb;
+       blkptr_t *bp = zio->io_bp;
+       blkptr_t *bp_orig = &zio->io_bp_orig;
+       dnode_t *dn = db->db_dnode;
+       objset_t *os = dn->dn_objset;
        uint64_t txg = zio->io_txg;
        dbuf_dirty_record_t **drp, *dr;
 
        ASSERT3U(zio->io_error, ==, 0);
+       ASSERT(db->db_blkptr == bp);
+
+       if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+               ASSERT(BP_EQUAL(bp, bp_orig));
+       } else {
+               dsl_dataset_t *ds = os->os_dsl_dataset;
+               dmu_tx_t *tx = os->os_synctx;
+
+               (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
+               dsl_dataset_block_born(ds, bp, tx);
+       }
 
        mutex_enter(&db->db_mtx);
 
+       DBUF_VERIFY(db);
+
        drp = &db->db_last_dirty;
        while ((dr = *drp) != db->db_data_pending)
                drp = &dr->dr_next;
        ASSERT(!list_link_active(&dr->dr_dirty_node));
        ASSERT(dr->dr_txg == txg);
+       ASSERT(dr->dr_dbuf == db);
        ASSERT(dr->dr_next == NULL);
        *drp = dr->dr_next;
 
+#ifdef ZFS_DEBUG
+       if (db->db_blkid == DMU_SPILL_BLKID) {
+               dnode_t *dn = db->db_dnode;
+               ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
+               ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
+                   db->db_blkptr == &dn->dn_phys->dn_spill);
+       }
+#endif
+
        if (db->db_level == 0) {
-               ASSERT(db->db_blkid != DB_BONUS_BLKID);
+               ASSERT(db->db_blkid != DMU_BONUS_BLKID);
                ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
-
                if (db->db_state != DB_NOFILL) {
                        if (dr->dt.dl.dr_data != db->db_buf)
                                VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
                                    db) == 1);
-                       else if (!BP_IS_HOLE(db->db_blkptr))
+                       else if (!arc_released(db->db_buf))
                                arc_set_callback(db->db_buf, dbuf_do_evict, db);
-                       else
-                               ASSERT(arc_released(db->db_buf));
                }
        } else {
-               dnode_t *dn = db->db_dnode;
-
                ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
                ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
                if (!BP_IS_HOLE(db->db_blkptr)) {
@@ -2410,9 +2420,129 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
        ASSERT(db->db_dirtycnt > 0);
        db->db_dirtycnt -= 1;
        db->db_data_pending = NULL;
+       dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
+}
+
+static void
+dbuf_write_nofill_ready(zio_t *zio)
+{
+       dbuf_write_ready(zio, NULL, zio->io_private);
+}
+
+static void
+dbuf_write_nofill_done(zio_t *zio)
+{
+       dbuf_write_done(zio, NULL, zio->io_private);
+}
+
+static void
+dbuf_write_override_ready(zio_t *zio)
+{
+       dbuf_dirty_record_t *dr = zio->io_private;
+       dmu_buf_impl_t *db = dr->dr_dbuf;
+
+       dbuf_write_ready(zio, NULL, db);
+}
+
+static void
+dbuf_write_override_done(zio_t *zio)
+{
+       dbuf_dirty_record_t *dr = zio->io_private;
+       dmu_buf_impl_t *db = dr->dr_dbuf;
+       blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
+
+       mutex_enter(&db->db_mtx);
+       if (!BP_EQUAL(zio->io_bp, obp)) {
+               if (!BP_IS_HOLE(obp))
+                       dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
+               arc_release(dr->dt.dl.dr_data, db);
+       }
        mutex_exit(&db->db_mtx);
 
-       dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", "");
+       dbuf_write_done(zio, NULL, db);
+}
+
+static void
+dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
+{
+       dmu_buf_impl_t *db = dr->dr_dbuf;
+       dnode_t *dn = db->db_dnode;
+       objset_t *os = dn->dn_objset;
+       dmu_buf_impl_t *parent = db->db_parent;
+       uint64_t txg = tx->tx_txg;
+       zbookmark_t zb;
+       zio_prop_t zp;
+       zio_t *zio;
+       int wp_flag = 0;
 
-       dbuf_rele(db, (void *)(uintptr_t)txg);
+       if (db->db_state != DB_NOFILL) {
+               if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
+                       /*
+                        * Private object buffers are released here rather
+                        * than in dbuf_dirty() since they are only modified
+                        * in the syncing context and we don't want the
+                        * overhead of making multiple copies of the data.
+                        */
+                       if (BP_IS_HOLE(db->db_blkptr)) {
+                               arc_buf_thaw(data);
+                       } else {
+                               dbuf_release_bp(db);
+                       }
+               }
+       }
+
+       if (parent != dn->dn_dbuf) {
+               ASSERT(parent && parent->db_data_pending);
+               ASSERT(db->db_level == parent->db_level-1);
+               ASSERT(arc_released(parent->db_buf));
+               zio = parent->db_data_pending->dr_zio;
+       } else {
+               ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
+                   db->db_blkid != DMU_SPILL_BLKID) ||
+                   (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
+               if (db->db_blkid != DMU_SPILL_BLKID)
+                       ASSERT3P(db->db_blkptr, ==,
+                           &dn->dn_phys->dn_blkptr[db->db_blkid]);
+               zio = dn->dn_zio;
+       }
+
+       ASSERT(db->db_level == 0 || data == db->db_buf);
+       ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
+       ASSERT(zio);
+
+       SET_BOOKMARK(&zb, os->os_dsl_dataset ?
+           os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+           db->db.db_object, db->db_level, db->db_blkid);
+
+       if (db->db_blkid == DMU_SPILL_BLKID)
+               wp_flag = WP_SPILL;
+       wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
+
+       dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
+
+       if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+               ASSERT(db->db_state != DB_NOFILL);
+               dr->dr_zio = zio_write(zio, os->os_spa, txg,
+                   db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
+                   dbuf_write_override_ready, dbuf_write_override_done, dr,
+                   ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+               mutex_enter(&db->db_mtx);
+               dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+               zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
+                   dr->dt.dl.dr_copies);
+               mutex_exit(&db->db_mtx);
+       } else if (db->db_state == DB_NOFILL) {
+               ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
+               dr->dr_zio = zio_write(zio, os->os_spa, txg,
+                   db->db_blkptr, NULL, db->db.db_size, &zp,
+                   dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
+                   ZIO_PRIORITY_ASYNC_WRITE,
+                   ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
+       } else {
+               ASSERT(arc_released(data));
+               dr->dr_zio = arc_write(zio, os->os_spa, txg,
+                   db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp,
+                   dbuf_write_ready, dbuf_write_done, db,
+                   ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+       }
 }
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
new file mode 100644 (file)
index 0000000..926b4df
--- /dev/null
@@ -0,0 +1,1140 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/dsl_scan.h>
+
+static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
+       &ddt_zap_ops,
+};
+
+static const char *ddt_class_name[DDT_CLASSES] = {
+       "ditto",
+       "duplicate",
+       "unique",
+};
+
+static void
+ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+    dmu_tx_t *tx)
+{
+       spa_t *spa = ddt->ddt_spa;
+       objset_t *os = ddt->ddt_os;
+       uint64_t *objectp = &ddt->ddt_object[type][class];
+       boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup;
+       char name[DDT_NAMELEN];
+
+       ddt_object_name(ddt, type, class, name);
+
+       ASSERT(*objectp == 0);
+       VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
+       ASSERT(*objectp != 0);
+
+       VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
+           sizeof (uint64_t), 1, objectp, tx) == 0);
+
+       VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
+           sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+           &ddt->ddt_histogram[type][class], tx) == 0);
+}
+
+static void
+ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+    dmu_tx_t *tx)
+{
+       spa_t *spa = ddt->ddt_spa;
+       objset_t *os = ddt->ddt_os;
+       uint64_t *objectp = &ddt->ddt_object[type][class];
+       char name[DDT_NAMELEN];
+
+       ddt_object_name(ddt, type, class, name);
+
+       ASSERT(*objectp != 0);
+       ASSERT(ddt_object_count(ddt, type, class) == 0);
+       ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
+       VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
+       VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
+       VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
+       bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
+
+       *objectp = 0;
+}
+
+static int
+ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+       ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
+       dmu_object_info_t doi;
+       char name[DDT_NAMELEN];
+       int error;
+
+       ddt_object_name(ddt, type, class, name);
+
+       error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
+           sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
+
+       if (error)
+               return (error);
+
+       error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+           sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+           &ddt->ddt_histogram[type][class]);
+
+       /*
+        * Seed the cached statistics.
+        */
+       VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
+
+       ddo->ddo_count = ddt_object_count(ddt, type, class);
+       ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
+       ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
+
+       ASSERT(error == 0);
+       return (error);
+}
+
+static void
+ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+    dmu_tx_t *tx)
+{
+       ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
+       dmu_object_info_t doi;
+       char name[DDT_NAMELEN];
+
+       ddt_object_name(ddt, type, class, name);
+
+       VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+           sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+           &ddt->ddt_histogram[type][class], tx) == 0);
+
+       /*
+        * Cache DDT statistics; this is the only time they'll change.
+        */
+       VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
+
+       ddo->ddo_count = ddt_object_count(ddt, type, class);
+       ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
+       ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
+}
+
+static int
+ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+    ddt_entry_t *dde)
+{
+       if (!ddt_object_exists(ddt, type, class))
+               return (ENOENT);
+
+       return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
+           ddt->ddt_object[type][class], dde));
+}
+
+static void
+ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+    ddt_entry_t *dde)
+{
+       if (!ddt_object_exists(ddt, type, class))
+               return;
+
+       ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
+           ddt->ddt_object[type][class], dde);
+}
+
+int
+ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+    ddt_entry_t *dde, dmu_tx_t *tx)
+{
+       ASSERT(ddt_object_exists(ddt, type, class));
+
+       return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
+           ddt->ddt_object[type][class], dde, tx));
+}
+
+static int
+ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+    ddt_entry_t *dde, dmu_tx_t *tx)
+{
+       ASSERT(ddt_object_exists(ddt, type, class));
+
+       return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
+           ddt->ddt_object[type][class], dde, tx));
+}
+
+int
+ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+    uint64_t *walk, ddt_entry_t *dde)
+{
+       ASSERT(ddt_object_exists(ddt, type, class));
+
+       return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
+           ddt->ddt_object[type][class], dde, walk));
+}
+
+uint64_t
+ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+       ASSERT(ddt_object_exists(ddt, type, class));
+
+       return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
+           ddt->ddt_object[type][class]));
+}
+
+int
+ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+    dmu_object_info_t *doi)
+{
+       if (!ddt_object_exists(ddt, type, class))
+               return (ENOENT);
+
+       return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
+           doi));
+}
+
+boolean_t
+ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+       return (!!ddt->ddt_object[type][class]);
+}
+
+void
+ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+    char *name)
+{
+       (void) sprintf(name, DMU_POOL_DDT,
+           zio_checksum_table[ddt->ddt_checksum].ci_name,
+           ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
+}
+
+void
+ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
+{
+       ASSERT(txg != 0);
+
+       for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+               bp->blk_dva[d] = ddp->ddp_dva[d];
+       BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
+}
+
+void
+ddt_bp_create(enum zio_checksum checksum,
+    const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
+{
+       BP_ZERO(bp);
+
+       if (ddp != NULL)
+               ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
+
+       bp->blk_cksum = ddk->ddk_cksum;
+       bp->blk_fill = 1;
+
+       BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
+       BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
+       BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
+       BP_SET_CHECKSUM(bp, checksum);
+       BP_SET_TYPE(bp, DMU_OT_DEDUP);
+       BP_SET_LEVEL(bp, 0);
+       BP_SET_DEDUP(bp, 0);
+       BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+}
+
+void
+ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
+{
+       ddk->ddk_cksum = bp->blk_cksum;
+       ddk->ddk_prop = 0;
+
+       DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
+       DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
+       DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
+}
+
+void
+ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
+{
+       ASSERT(ddp->ddp_phys_birth == 0);
+
+       for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+               ddp->ddp_dva[d] = bp->blk_dva[d];
+       ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
+}
+
+void
+ddt_phys_clear(ddt_phys_t *ddp)
+{
+       bzero(ddp, sizeof (*ddp));
+}
+
+void
+ddt_phys_addref(ddt_phys_t *ddp)
+{
+       ddp->ddp_refcnt++;
+}
+
+void
+ddt_phys_decref(ddt_phys_t *ddp)
+{
+       ASSERT((int64_t)ddp->ddp_refcnt > 0);
+       ddp->ddp_refcnt--;
+}
+
+void
+ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
+{
+       blkptr_t blk;
+
+       ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+       ddt_phys_clear(ddp);
+       zio_free(ddt->ddt_spa, txg, &blk);
+}
+
+ddt_phys_t *
+ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
+{
+       ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;
+
+       for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+               if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
+                   BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
+                       return (ddp);
+       }
+       return (NULL);
+}
+
+uint64_t
+ddt_phys_total_refcnt(const ddt_entry_t *dde)
+{
+       uint64_t refcnt = 0;
+
+       for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
+               refcnt += dde->dde_phys[p].ddp_refcnt;
+
+       return (refcnt);
+}
+
+static void
+ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
+{
+       spa_t *spa = ddt->ddt_spa;
+       ddt_phys_t *ddp = dde->dde_phys;
+       ddt_key_t *ddk = &dde->dde_key;
+       uint64_t lsize = DDK_GET_LSIZE(ddk);
+       uint64_t psize = DDK_GET_PSIZE(ddk);
+
+       bzero(dds, sizeof (*dds));
+
+       for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+               uint64_t dsize = 0;
+               uint64_t refcnt = ddp->ddp_refcnt;
+
+               if (ddp->ddp_phys_birth == 0)
+                       continue;
+
+               for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+                       dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
+
+               dds->dds_blocks += 1;
+               dds->dds_lsize += lsize;
+               dds->dds_psize += psize;
+               dds->dds_dsize += dsize;
+
+               dds->dds_ref_blocks += refcnt;
+               dds->dds_ref_lsize += lsize * refcnt;
+               dds->dds_ref_psize += psize * refcnt;
+               dds->dds_ref_dsize += dsize * refcnt;
+       }
+}
+
+void
+ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
+{
+       const uint64_t *s = (const uint64_t *)src;
+       uint64_t *d = (uint64_t *)dst;
+       uint64_t *d_end = (uint64_t *)(dst + 1);
+
+       ASSERT(neg == 0 || neg == -1ULL);       /* add or subtract */
+
+       while (d < d_end)
+               *d++ += (*s++ ^ neg) - neg;
+}
+
+static void
+ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
+{
+       ddt_stat_t dds;
+       ddt_histogram_t *ddh;
+       int bucket;
+
+       ddt_stat_generate(ddt, dde, &dds);
+
+       bucket = highbit(dds.dds_ref_blocks) - 1;
+       ASSERT(bucket >= 0);
+
+       ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
+
+       ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
+}
+
+void
+ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
+{
+       for (int h = 0; h < 64; h++)
+               ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
+}
+
+void
+ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
+{
+       bzero(dds, sizeof (*dds));
+
+       for (int h = 0; h < 64; h++)
+               ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
+}
+
+boolean_t
+ddt_histogram_empty(const ddt_histogram_t *ddh)
+{
+       const uint64_t *s = (const uint64_t *)ddh;
+       const uint64_t *s_end = (const uint64_t *)(ddh + 1);
+
+       while (s < s_end)
+               if (*s++ != 0)
+                       return (B_FALSE);
+
+       return (B_TRUE);
+}
+
+void
+ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
+{
+       /* Sum the statistics we cached in ddt_object_sync(). */
+       for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+               ddt_t *ddt = spa->spa_ddt[c];
+               for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+                       for (enum ddt_class class = 0; class < DDT_CLASSES;
+                           class++) {
+                               ddt_object_t *ddo =
+                                   &ddt->ddt_object_stats[type][class];
+                               ddo_total->ddo_count += ddo->ddo_count;
+                               ddo_total->ddo_dspace += ddo->ddo_dspace;
+                               ddo_total->ddo_mspace += ddo->ddo_mspace;
+                       }
+               }
+       }
+
+       /* ... and compute the averages. */
+       if (ddo_total->ddo_count != 0) {
+               ddo_total->ddo_dspace /= ddo_total->ddo_count;
+               ddo_total->ddo_mspace /= ddo_total->ddo_count;
+       } else {
+               ASSERT(ddo_total->ddo_dspace == 0);
+               ASSERT(ddo_total->ddo_mspace == 0);
+       }
+}
+
+void
+ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
+{
+       for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+               ddt_t *ddt = spa->spa_ddt[c];
+               for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+                       for (enum ddt_class class = 0; class < DDT_CLASSES;
+                           class++) {
+                               ddt_histogram_add(ddh,
+                                   &ddt->ddt_histogram_cache[type][class]);
+                       }
+               }
+       }
+}
+
+void
+ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
+{
+       ddt_histogram_t *ddh_total;
+
+       ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
+       ddt_get_dedup_histogram(spa, ddh_total);
+       ddt_histogram_stat(dds_total, ddh_total);
+       kmem_free(ddh_total, sizeof (ddt_histogram_t));
+}
+
+uint64_t
+ddt_get_dedup_dspace(spa_t *spa)
+{
+       ddt_stat_t dds_total = { 0 };
+
+       ddt_get_dedup_stats(spa, &dds_total);
+       return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
+}
+
+uint64_t
+ddt_get_pool_dedup_ratio(spa_t *spa)
+{
+       ddt_stat_t dds_total = { 0 };
+
+       ddt_get_dedup_stats(spa, &dds_total);
+       if (dds_total.dds_dsize == 0)
+               return (100);
+
+       return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
+}
+
+int
+ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
+{
+       spa_t *spa = ddt->ddt_spa;
+       uint64_t total_refcnt = 0;
+       uint64_t ditto = spa->spa_dedup_ditto;
+       int total_copies = 0;
+       int desired_copies = 0;
+
+       for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+               ddt_phys_t *ddp = &dde->dde_phys[p];
+               zio_t *zio = dde->dde_lead_zio[p];
+               uint64_t refcnt = ddp->ddp_refcnt;      /* committed refs */
+               if (zio != NULL)
+                       refcnt += zio->io_parent_count; /* pending refs */
+               if (ddp == ddp_willref)
+                       refcnt++;                       /* caller's ref */
+               if (refcnt != 0) {
+                       total_refcnt += refcnt;
+                       total_copies += p;
+               }
+       }
+
+       if (ditto == 0 || ditto > UINT32_MAX)
+               ditto = UINT32_MAX;
+
+       if (total_refcnt >= 1)
+               desired_copies++;
+       if (total_refcnt >= ditto)
+               desired_copies++;
+       if (total_refcnt >= ditto * ditto)
+               desired_copies++;
+
+       return (MAX(desired_copies, total_copies) - total_copies);
+}
+
+int
+ddt_ditto_copies_present(ddt_entry_t *dde)
+{
+       ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO];
+       dva_t *dva = ddp->ddp_dva;
+       int copies = 0 - DVA_GET_GANG(dva);
+
+       for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
+               if (DVA_IS_VALID(dva))
+                       copies++;
+
+       ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP);
+
+       return (copies);
+}
+
+/*
+ * Compress a DDT phys array into dst using ZLE.  The first output byte
+ * records the host byte order plus the compression function actually used,
+ * so ddt_decompress() can undo both.  If ZLE cannot shrink the data
+ * (c_len == s_len), the source is stored uncompressed.  Returns the number
+ * of bytes written, including the version byte.
+ */
+size_t
+ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
+{
+       uchar_t *version = dst++;
+       int cpfunc = ZIO_COMPRESS_ZLE;
+       zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+       size_t c_len;
+
+       ASSERT(d_len >= s_len + 1);     /* no compression plus version byte */
+
+       c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);
+
+       if (c_len == s_len) {
+               cpfunc = ZIO_COMPRESS_OFF;
+               bcopy(src, dst, s_len);
+       }
+
+       *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc;
+
+       return (c_len + 1);
+}
+
+/*
+ * Inverse of ddt_compress(): decode the leading version byte, decompress
+ * (or copy) the payload into dst, and byteswap if the data was written on
+ * a host with the opposite endianness.
+ */
+void
+ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
+{
+       uchar_t version = *src++;
+       int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
+       zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+
+       if (ci->ci_decompress != NULL)
+               (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
+       else
+               bcopy(src, dst, d_len);
+
+       if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK)
+               byteswap_uint64_array(dst, d_len);
+}
+
+/*
+ * Return the per-pool DDT for an explicit checksum function.
+ */
+ddt_t *
+ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
+{
+       return (spa->spa_ddt[c]);
+}
+
+/*
+ * Return the DDT that covers the given block pointer (keyed by its
+ * checksum function).
+ */
+ddt_t *
+ddt_select(spa_t *spa, const blkptr_t *bp)
+{
+       return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
+}
+
+/* Acquire the table lock protecting ddt_tree and entry state. */
+void
+ddt_enter(ddt_t *ddt)
+{
+       mutex_enter(&ddt->ddt_lock);
+}
+
+/* Release the table lock taken by ddt_enter(). */
+void
+ddt_exit(ddt_t *ddt)
+{
+       mutex_exit(&ddt->ddt_lock);
+}
+
+/*
+ * Allocate a zeroed in-core DDT entry for the given key.  The entry's
+ * condvar is used by ddt_lookup() to serialize on-disk loads.
+ */
+static ddt_entry_t *
+ddt_alloc(const ddt_key_t *ddk)
+{
+       ddt_entry_t *dde;
+
+       dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);
+       cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
+
+       dde->dde_key = *ddk;
+
+       return (dde);
+}
+
+/*
+ * Free an entry that is not being loaded and has no lead zios, releasing
+ * any attached repair buffer first.
+ */
+static void
+ddt_free(ddt_entry_t *dde)
+{
+       ASSERT(!dde->dde_loading);
+
+       for (int p = 0; p < DDT_PHYS_TYPES; p++)
+               ASSERT(dde->dde_lead_zio[p] == NULL);
+
+       if (dde->dde_repair_data != NULL)
+               zio_buf_free(dde->dde_repair_data,
+                   DDK_GET_PSIZE(&dde->dde_key));
+
+       cv_destroy(&dde->dde_cv);
+       kmem_free(dde, sizeof (*dde));
+}
+
+/*
+ * Unlink an entry from the in-core AVL tree and free it.
+ * Caller must hold the DDT lock.
+ */
+void
+ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
+{
+       ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+
+       avl_remove(&ddt->ddt_tree, dde);
+       ddt_free(dde);
+}
+
+/*
+ * Find (or, if 'add' is set, create) the in-core entry for bp's dedup key.
+ * Called with the DDT lock held.  If the entry has not yet been loaded
+ * from disk, this thread marks it 'loading', drops the lock to probe every
+ * on-disk (type, class) object, then reacquires the lock and publishes the
+ * result; concurrent lookups wait on dde_cv meanwhile.  Returns NULL only
+ * when the entry is absent and !add.
+ */
+ddt_entry_t *
+ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
+{
+       ddt_entry_t *dde, dde_search;
+       enum ddt_type type;
+       enum ddt_class class;
+       avl_index_t where;
+       int error;
+
+       ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+
+       ddt_key_fill(&dde_search.dde_key, bp);
+
+       dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
+       if (dde == NULL) {
+               if (!add)
+                       return (NULL);
+               dde = ddt_alloc(&dde_search.dde_key);
+               avl_insert(&ddt->ddt_tree, dde, where);
+       }
+
+       /* Another thread may already be reading this entry from disk. */
+       while (dde->dde_loading)
+               cv_wait(&dde->dde_cv, &ddt->ddt_lock);
+
+       if (dde->dde_loaded)
+               return (dde);
+
+       dde->dde_loading = B_TRUE;
+
+       /* Drop the lock for the on-disk lookups; dde_loading guards us. */
+       ddt_exit(ddt);
+
+       error = ENOENT;
+
+       for (type = 0; type < DDT_TYPES; type++) {
+               for (class = 0; class < DDT_CLASSES; class++) {
+                       error = ddt_object_lookup(ddt, type, class, dde);
+                       if (error != ENOENT)
+                               break;
+               }
+               if (error != ENOENT)
+                       break;
+       }
+
+       ASSERT(error == 0 || error == ENOENT);
+
+       ddt_enter(ddt);
+
+       ASSERT(dde->dde_loaded == B_FALSE);
+       ASSERT(dde->dde_loading == B_TRUE);
+
+       dde->dde_type = type;   /* will be DDT_TYPES if no entry found */
+       dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
+       dde->dde_loaded = B_TRUE;
+       dde->dde_loading = B_FALSE;
+
+       if (error == 0)
+               ddt_stat_update(ddt, dde, -1ULL);
+
+       /* Wake any threads blocked in the cv_wait() above. */
+       cv_broadcast(&dde->dde_cv);
+
+       return (dde);
+}
+
+/*
+ * Issue async prefetches of bp's dedup entry against every on-disk
+ * (type, class) DDT object.  No-op for non-dedup blocks.
+ */
+void
+ddt_prefetch(spa_t *spa, const blkptr_t *bp)
+{
+       ddt_t *ddt;
+       ddt_entry_t dde;
+
+       if (!BP_GET_DEDUP(bp))
+               return;
+
+       /*
+        * We remove the DDT once it's empty and only prefetch dedup blocks
+        * when there are entries in the DDT.  Thus no locking is required
+        * as the DDT can't disappear on us.
+        */
+       ddt = ddt_select(spa, bp);
+       ddt_key_fill(&dde.dde_key, bp);
+
+       for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+               for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+                       ddt_object_prefetch(ddt, type, class, &dde);
+               }
+       }
+}
+
+/*
+ * AVL comparator: order entries by their key, compared word-by-word as
+ * uint64_t values.  Returns -1/0/+1.
+ */
+int
+ddt_entry_compare(const void *x1, const void *x2)
+{
+       const ddt_entry_t *dde1 = x1;
+       const ddt_entry_t *dde2 = x2;
+       const uint64_t *u1 = (const uint64_t *)&dde1->dde_key;
+       const uint64_t *u2 = (const uint64_t *)&dde2->dde_key;
+
+       for (int i = 0; i < DDT_KEY_WORDS; i++) {
+               if (u1[i] < u2[i])
+                       return (-1);
+               if (u1[i] > u2[i])
+                       return (1);
+       }
+
+       return (0);
+}
+
+/*
+ * Allocate and initialize the in-core DDT for one checksum function:
+ * its lock and the live and repair AVL trees, both keyed by dde_key.
+ */
+static ddt_t *
+ddt_table_alloc(spa_t *spa, enum zio_checksum c)
+{
+       ddt_t *ddt;
+
+       ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
+
+       mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
+       avl_create(&ddt->ddt_tree, ddt_entry_compare,
+           sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+       avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
+           sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+       ddt->ddt_checksum = c;
+       ddt->ddt_spa = spa;
+       ddt->ddt_os = spa->spa_meta_objset;
+
+       return (ddt);
+}
+
+/*
+ * Tear down a DDT allocated by ddt_table_alloc().  Both trees must
+ * already be empty.
+ */
+static void
+ddt_table_free(ddt_t *ddt)
+{
+       ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
+       ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
+       avl_destroy(&ddt->ddt_tree);
+       avl_destroy(&ddt->ddt_repair_tree);
+       mutex_destroy(&ddt->ddt_lock);
+       kmem_free(ddt, sizeof (*ddt));
+}
+
+/*
+ * Create the in-core DDTs for every checksum function and set the pool's
+ * default dedup checksum.
+ */
+void
+ddt_create(spa_t *spa)
+{
+       spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
+
+       for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
+               spa->spa_ddt[c] = ddt_table_alloc(spa, c);
+}
+
+/*
+ * Create the in-core DDTs and load on-disk state: the stat object from the
+ * pool directory (its absence, ENOENT, is not an error -- the pool simply
+ * has no DDT yet) and every existing (type, class) object, seeding the
+ * cached histograms.  Returns 0 or a fatal lookup/load error.
+ */
+int
+ddt_load(spa_t *spa)
+{
+       int error;
+
+       ddt_create(spa);
+
+       error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+           DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
+           &spa->spa_ddt_stat_object);
+
+       if (error)
+               return (error == ENOENT ? 0 : error);
+
+       for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+               ddt_t *ddt = spa->spa_ddt[c];
+               for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+                       for (enum ddt_class class = 0; class < DDT_CLASSES;
+                           class++) {
+                               error = ddt_object_load(ddt, type, class);
+                               if (error != 0 && error != ENOENT)
+                                       return (error);
+                       }
+               }
+
+               /*
+                * Seed the cached histograms.
+                */
+               bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+                   sizeof (ddt->ddt_histogram));
+       }
+
+       return (0);
+}
+
+/*
+ * Free every in-core DDT; the inverse of ddt_create()/ddt_load().
+ */
+void
+ddt_unload(spa_t *spa)
+{
+       for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+               if (spa->spa_ddt[c]) {
+                       ddt_table_free(spa->spa_ddt[c]);
+                       spa->spa_ddt[c] = NULL;
+               }
+       }
+}
+
+/*
+ * Answer whether bp's dedup entry could live in any class <= max_class.
+ * DDT_CLASS_UNIQUE is the highest class value checked here, so max_class
+ * == DDT_CLASS_UNIQUE trivially covers everything and returns B_TRUE
+ * without touching disk; otherwise probe each on-disk object up to and
+ * including max_class.
+ */
+boolean_t
+ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
+{
+       ddt_t *ddt;
+       ddt_entry_t dde;
+
+       if (!BP_GET_DEDUP(bp))
+               return (B_FALSE);
+
+       if (max_class == DDT_CLASS_UNIQUE)
+               return (B_TRUE);
+
+       ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
+
+       ddt_key_fill(&dde.dde_key, bp);
+
+       for (enum ddt_type type = 0; type < DDT_TYPES; type++)
+               for (enum ddt_class class = 0; class <= max_class; class++)
+                       if (ddt_object_lookup(ddt, type, class, &dde) == 0)
+                               return (B_TRUE);
+
+       return (B_FALSE);
+}
+
+/*
+ * Begin a self-healing repair for bp: allocate a fresh entry and populate
+ * its phys array from the first on-disk object that knows this key.  If no
+ * usable copy exists the phys array is zeroed, which makes the later
+ * repair a no-op.
+ */
+ddt_entry_t *
+ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
+{
+       ddt_key_t ddk;
+       ddt_entry_t *dde;
+
+       ddt_key_fill(&ddk, bp);
+
+       dde = ddt_alloc(&ddk);
+
+       for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+               for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+                       /*
+                        * We can only do repair if there are multiple copies
+                        * of the block.  For anything in the UNIQUE class,
+                        * there's definitely only one copy, so don't even try.
+                        */
+                       if (class != DDT_CLASS_UNIQUE &&
+                           ddt_object_lookup(ddt, type, class, dde) == 0)
+                               return (dde);
+               }
+       }
+
+       bzero(dde->dde_phys, sizeof (dde->dde_phys));
+
+       return (dde);
+}
+
+/*
+ * Finish a repair started by ddt_repair_start(): if we have repair data,
+ * the pool is writeable, and no equivalent entry is queued yet, queue the
+ * entry on the repair tree for ddt_repair_table(); otherwise free it.
+ */
+void
+ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
+{
+       avl_index_t where;
+
+       ddt_enter(ddt);
+
+       if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) &&
+           avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
+               avl_insert(&ddt->ddt_repair_tree, dde, where);
+       else
+               ddt_free(dde);
+
+       ddt_exit(ddt);
+}
+
+/* zio done callback: free the repair entry once all rewrites complete. */
+static void
+ddt_repair_entry_done(zio_t *zio)
+{
+       ddt_entry_t *rdde = zio->io_private;
+
+       ddt_free(rdde);
+}
+
+/*
+ * Rewrite every damaged copy of rdde using the known-good repair data.
+ * Only phys slots whose birth txg and DVAs still match the current entry
+ * are rewritten; the wrapping null zio frees rdde when all children finish.
+ */
+static void
+ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
+{
+       ddt_phys_t *ddp = dde->dde_phys;
+       ddt_phys_t *rddp = rdde->dde_phys;
+       ddt_key_t *ddk = &dde->dde_key;
+       ddt_key_t *rddk = &rdde->dde_key;
+       zio_t *zio;
+       blkptr_t blk;
+
+       zio = zio_null(rio, rio->io_spa, NULL,
+           ddt_repair_entry_done, rdde, rio->io_flags);
+
+       for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
+               if (ddp->ddp_phys_birth == 0 ||
+                   ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
+                   bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
+                       continue;
+               ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+               zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
+                   rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL,
+                   ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
+       }
+
+       zio_nowait(zio);
+}
+
+/*
+ * Drain the repair tree, issuing a repair for each queued entry.  Runs
+ * only on sync pass 1.  The DDT lock is dropped around each repair since
+ * the entry has already been removed from the tree.
+ */
+static void
+ddt_repair_table(ddt_t *ddt, zio_t *rio)
+{
+       spa_t *spa = ddt->ddt_spa;
+       ddt_entry_t *dde, *rdde_next, *rdde;
+       avl_tree_t *t = &ddt->ddt_repair_tree;
+       blkptr_t blk;
+
+       if (spa_sync_pass(spa) > 1)
+               return;
+
+       ddt_enter(ddt);
+       for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
+               rdde_next = AVL_NEXT(t, rdde);
+               avl_remove(&ddt->ddt_repair_tree, rdde);
+               ddt_exit(ddt);
+               ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
+               dde = ddt_repair_start(ddt, &blk);
+               ddt_repair_entry(ddt, dde, rdde, rio);
+               ddt_repair_done(ddt, dde);
+               ddt_enter(ddt);
+       }
+       ddt_exit(ddt);
+}
+
+/*
+ * Sync one in-core entry back to its on-disk object: free phys slots that
+ * are no longer referenced (including an unneeded ditto copy), compute the
+ * entry's new class from its surviving refcounts, move it between objects
+ * if type/class changed, and update stats.  A class decrease also forces
+ * an immediate scan of the entry so an in-progress scrub/traverse does not
+ * miss it.
+ */
+static void
+ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
+{
+       dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
+       ddt_phys_t *ddp = dde->dde_phys;
+       ddt_key_t *ddk = &dde->dde_key;
+       enum ddt_type otype = dde->dde_type;
+       enum ddt_type ntype = DDT_TYPE_CURRENT;
+       enum ddt_class oclass = dde->dde_class;
+       enum ddt_class nclass;
+       uint64_t total_refcnt = 0;
+
+       ASSERT(dde->dde_loaded);
+       ASSERT(!dde->dde_loading);
+
+       for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+               ASSERT(dde->dde_lead_zio[p] == NULL);
+               ASSERT((int64_t)ddp->ddp_refcnt >= 0);
+               if (ddp->ddp_phys_birth == 0) {
+                       ASSERT(ddp->ddp_refcnt == 0);
+                       continue;
+               }
+               if (p == DDT_PHYS_DITTO) {
+                       /* Ditto copies don't count toward total_refcnt. */
+                       if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
+                               ddt_phys_free(ddt, ddk, ddp, txg);
+                       continue;
+               }
+               if (ddp->ddp_refcnt == 0)
+                       ddt_phys_free(ddt, ddk, ddp, txg);
+               total_refcnt += ddp->ddp_refcnt;
+       }
+
+       /* Reclassify: ditto > duplicate > unique. */
+       if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
+               nclass = DDT_CLASS_DITTO;
+       else if (total_refcnt > 1)
+               nclass = DDT_CLASS_DUPLICATE;
+       else
+               nclass = DDT_CLASS_UNIQUE;
+
+       /* otype == DDT_TYPES means the entry was never on disk. */
+       if (otype != DDT_TYPES &&
+           (otype != ntype || oclass != nclass || total_refcnt == 0)) {
+               VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
+               ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
+       }
+
+       if (total_refcnt != 0) {
+               dde->dde_type = ntype;
+               dde->dde_class = nclass;
+               ddt_stat_update(ddt, dde, 0);
+               if (!ddt_object_exists(ddt, ntype, nclass))
+                       ddt_object_create(ddt, ntype, nclass, tx);
+               VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
+
+               /*
+                * If the class changes, the order that we scan this bp
+                * changes.  If it decreases, we could miss it, so
+                * scan it right now.  (This covers both class changing
+                * while we are doing ddt_walk(), and when we are
+                * traversing.)
+                */
+               if (nclass < oclass) {
+                       dsl_scan_ddt_entry(dp->dp_scan,
+                           ddt->ddt_checksum, dde, tx);
+               }
+       }
+}
+
+/*
+ * Sync all dirty in-core entries of one DDT: lazily create the pool-wide
+ * stat object, write out every entry (freeing them as we go), sync each
+ * on-disk object, destroy objects that became empty, and refresh the
+ * cached histograms.
+ */
+static void
+ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
+{
+       spa_t *spa = ddt->ddt_spa;
+       ddt_entry_t *dde;
+       void *cookie = NULL;
+
+       if (avl_numnodes(&ddt->ddt_tree) == 0)
+               return;
+
+       ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
+
+       if (spa->spa_ddt_stat_object == 0) {
+               spa->spa_ddt_stat_object = zap_create(ddt->ddt_os,
+                   DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx);
+               VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT,
+                   DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
+                   &spa->spa_ddt_stat_object, tx) == 0);
+       }
+
+       while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
+               ddt_sync_entry(ddt, dde, tx, txg);
+               ddt_free(dde);
+       }
+
+       for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+               for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+                       if (!ddt_object_exists(ddt, type, class))
+                               continue;
+                       ddt_object_sync(ddt, type, class, tx);
+                       if (ddt_object_count(ddt, type, class) == 0)
+                               ddt_object_destroy(ddt, type, class, tx);
+               }
+       }
+
+       bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+           sizeof (ddt->ddt_histogram));
+}
+
+/*
+ * Per-txg DDT sync: in an assigned tx, sync every table and kick off any
+ * queued self-healing repairs under a best-effort (CANFAIL|SPECULATIVE)
+ * root zio that we wait on before committing.
+ */
+void
+ddt_sync(spa_t *spa, uint64_t txg)
+{
+       dmu_tx_t *tx;
+       zio_t *rio = zio_root(spa, NULL, NULL,
+           ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+
+       ASSERT(spa_syncing_txg(spa) == txg);
+
+       tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+       for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+               ddt_t *ddt = spa->spa_ddt[c];
+               if (ddt == NULL)
+                       continue;
+               ddt_sync_table(ddt, tx, txg);
+               ddt_repair_table(ddt, rio);
+       }
+
+       (void) zio_wait(rio);
+
+       dmu_tx_commit(tx);
+}
+
+/*
+ * Resumable iteration over every entry in every DDT, ordered by
+ * (class, type, checksum, cursor); the bookmark ddb records the position
+ * so callers (e.g. the scanner) can continue across calls.  Fills in *dde
+ * and returns 0 for each entry, ENOENT when the walk is exhausted, or
+ * another error from the underlying object walk.
+ */
+int
+ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
+{
+       do {
+               do {
+                       do {
+                               ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
+                               int error = ENOENT;
+                               if (ddt_object_exists(ddt, ddb->ddb_type,
+                                   ddb->ddb_class)) {
+                                       error = ddt_object_walk(ddt,
+                                           ddb->ddb_type, ddb->ddb_class,
+                                           &ddb->ddb_cursor, dde);
+                               }
+                               dde->dde_type = ddb->ddb_type;
+                               dde->dde_class = ddb->ddb_class;
+                               if (error == 0)
+                                       return (0);
+                               if (error != ENOENT)
+                                       return (error);
+                               ddb->ddb_cursor = 0;
+                       } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
+                       ddb->ddb_checksum = 0;
+               } while (++ddb->ddb_type < DDT_TYPES);
+               ddb->ddb_type = 0;
+       } while (++ddb->ddb_class < DDT_CLASSES);
+
+       return (ENOENT);
+}
diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c
new file mode 100644 (file)
index 0000000..d6a991c
--- /dev/null
@@ -0,0 +1,157 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <util/sscanf.h>
+
+int ddt_zap_leaf_blockshift = 12;
+int ddt_zap_indirect_blockshift = 12;
+
+/*
+ * ddt_ops create hook: make a uint64-keyed fat ZAP for DDT entries,
+ * optionally marking keys as pre-hashed.  Returns ENOTSUP if the ZAP
+ * could not be created.
+ */
+static int
+ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash)
+{
+       zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY;
+
+       if (prehash)
+               flags |= ZAP_FLAG_PRE_HASHED_KEY;
+
+       *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP,
+           ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift,
+           DMU_OT_NONE, 0, tx);
+
+       return (*objectp == 0 ? ENOTSUP : 0);
+}
+
+/* ddt_ops destroy hook: remove the backing ZAP object. */
+static int
+ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+       return (zap_destroy(os, object, tx));
+}
+
+/*
+ * ddt_ops lookup hook: fetch the compressed phys payload keyed by
+ * dde_key, then decompress it into dde_phys.  Returns the zap error
+ * (e.g. ENOENT) on failure.
+ */
+static int
+ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde)
+{
+       uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+       uint64_t one, csize;
+       int error;
+
+       error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key,
+           DDT_KEY_WORDS, &one, &csize);
+       if (error)
+               return (error);
+
+       ASSERT(one == 1);
+       ASSERT(csize <= sizeof (cbuf));
+
+       error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key,
+           DDT_KEY_WORDS, 1, csize, cbuf);
+       if (error)
+               return (error);
+
+       ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys));
+
+       return (0);
+}
+
+/* ddt_ops prefetch hook: async read-ahead of the entry's ZAP leaf. */
+static void
+ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde)
+{
+       (void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key,
+           DDT_KEY_WORDS);
+}
+
+/*
+ * ddt_ops update hook: compress dde_phys and store it under dde_key,
+ * creating or replacing the existing value.
+ */
+static int
+ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+{
+       uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+       uint64_t csize;
+
+       csize = ddt_compress(dde->dde_phys, cbuf,
+           sizeof (dde->dde_phys), sizeof (cbuf));
+
+       return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key,
+           DDT_KEY_WORDS, 1, csize, cbuf, tx));
+}
+
+/* ddt_ops remove hook: delete the entry keyed by dde_key. */
+static int
+ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+{
+       return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key,
+           DDT_KEY_WORDS, tx));
+}
+
+/*
+ * ddt_ops walk hook: resume a cursor-based iteration at *walk, fill in
+ * dde's key and decompressed phys from the next ZAP attribute, and write
+ * the serialized cursor back for the next call.  Returns the cursor error
+ * (ENOENT at end of object).
+ */
+static int
+ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk)
+{
+       zap_cursor_t zc;
+       zap_attribute_t za;
+       int error;
+
+       zap_cursor_init_serialized(&zc, os, object, *walk);
+       if ((error = zap_cursor_retrieve(&zc, &za)) == 0) {
+               uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+               uint64_t csize = za.za_num_integers;
+               ASSERT(za.za_integer_length == 1);
+               error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name,
+                   DDT_KEY_WORDS, 1, csize, cbuf);
+               ASSERT(error == 0);
+               if (error == 0) {
+                       ddt_decompress(cbuf, dde->dde_phys, csize,
+                           sizeof (dde->dde_phys));
+                       dde->dde_key = *(ddt_key_t *)za.za_name;
+               }
+               zap_cursor_advance(&zc);
+               *walk = zap_cursor_serialize(&zc);
+       }
+       zap_cursor_fini(&zc);
+       return (error);
+}
+
+/* ddt_ops count hook: number of entries in the backing ZAP. */
+static uint64_t
+ddt_zap_count(objset_t *os, uint64_t object)
+{
+       uint64_t count = 0;
+
+       VERIFY(zap_count(os, object, &count) == 0);
+
+       return (count);
+}
+
+/* ZAP-backed implementation of the DDT object ops vector. */
+const ddt_ops_t ddt_zap_ops = {
+       "zap",
+       ddt_zap_create,
+       ddt_zap_destroy,
+       ddt_zap_lookup,
+       ddt_zap_prefetch,
+       ddt_zap_update,
+       ddt_zap_remove,
+       ddt_zap_walk,
+       ddt_zap_count,
+};
index d864682..5b87c81 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -40,6 +39,7 @@
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/zio_checksum.h>
+#include <sys/sa.h>
 #ifdef _KERNEL
 #include <sys/vmsystm.h>
 #include <sys/zfs_znode.h>
@@ -51,8 +51,8 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
        {       byteswap_uint64_array,  TRUE,   "object array"          },
        {       byteswap_uint8_array,   TRUE,   "packed nvlist"         },
        {       byteswap_uint64_array,  TRUE,   "packed nvlist size"    },
-       {       byteswap_uint64_array,  TRUE,   "bplist"                },
-       {       byteswap_uint64_array,  TRUE,   "bplist header"         },
+       {       byteswap_uint64_array,  TRUE,   "bpobj"                 },
+       {       byteswap_uint64_array,  TRUE,   "bpobj header"          },
        {       byteswap_uint64_array,  TRUE,   "SPA space map header"  },
        {       byteswap_uint64_array,  TRUE,   "SPA space map"         },
        {       byteswap_uint64_array,  TRUE,   "ZIL intent log"        },
@@ -84,22 +84,38 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
        {       byteswap_uint8_array,   TRUE,   "FUID table"            },
        {       byteswap_uint64_array,  TRUE,   "FUID table size"       },
        {       zap_byteswap,           TRUE,   "DSL dataset next clones"},
-       {       zap_byteswap,           TRUE,   "scrub work queue"      },
+       {       zap_byteswap,           TRUE,   "scan work queue"       },
        {       zap_byteswap,           TRUE,   "ZFS user/group used"   },
        {       zap_byteswap,           TRUE,   "ZFS user/group quota"  },
        {       zap_byteswap,           TRUE,   "snapshot refcount tags"},
+       {       zap_byteswap,           TRUE,   "DDT ZAP algorithm"     },
+       {       zap_byteswap,           TRUE,   "DDT statistics"        },
+       {       byteswap_uint8_array,   TRUE,   "System attributes"     },
+       {       zap_byteswap,           TRUE,   "SA master node"        },
+       {       zap_byteswap,           TRUE,   "SA attr registration"  },
+       {       zap_byteswap,           TRUE,   "SA attr layouts"       },
+       {       zap_byteswap,           TRUE,   "scan translations"     },
+       {       byteswap_uint8_array,   FALSE,  "deduplicated block"    },
+       {       zap_byteswap,           TRUE,   "DSL deadlist map"      },
+       {       byteswap_uint64_array,  TRUE,   "DSL deadlist map hdr"  },
+       {       zap_byteswap,           TRUE,   "DSL dir clones"        },
+       {       byteswap_uint64_array,  TRUE,   "bpobj subobj"          },
 };
 
 int
 dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
-    void *tag, dmu_buf_t **dbp)
+    void *tag, dmu_buf_t **dbp, int flags)
 {
        dnode_t *dn;
        uint64_t blkid;
        dmu_buf_impl_t *db;
        int err;
+       int db_flags = DB_RF_CANFAIL;
 
-       err = dnode_hold(os->os, object, FTAG, &dn);
+       if (flags & DMU_READ_NO_PREFETCH)
+               db_flags |= DB_RF_NOPREFETCH;
+
+       err = dnode_hold(os, object, FTAG, &dn);
        if (err)
                return (err);
        blkid = dbuf_whichblock(dn, offset);
@@ -109,7 +125,7 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
        if (db == NULL) {
                err = EIO;
        } else {
-               err = dbuf_read(db, NULL, DB_RF_CANFAIL);
+               err = dbuf_read(db, NULL, db_flags);
                if (err) {
                        dbuf_rele(db, tag);
                        db = NULL;
@@ -140,6 +156,36 @@ dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx)
        return (0);
 }
 
+/*
+ * Change the bonus buffer's DMU object type.  db must be the object's
+ * bonus dbuf; returns EINVAL otherwise or for an out-of-range type.
+ * NOTE(review): the range check uses '>' so type == DMU_OT_NUMTYPES is
+ * accepted -- confirm whether that sentinel value is intentional here.
+ */
+int
+dmu_set_bonustype(dmu_buf_t *db, dmu_object_type_t type, dmu_tx_t *tx)
+{
+       dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+
+       if (type > DMU_OT_NUMTYPES)
+               return (EINVAL);
+
+       if (dn->dn_bonus != (dmu_buf_impl_t *)db)
+               return (EINVAL);
+
+       dnode_setbonus_type(dn, type, tx);
+       return (0);
+}
+
+/*
+ * Remove the spill block from the given object: drop the spill dbuf and
+ * clear the dnode's spill state under dn_struct_rwlock.
+ * Returns 0, or the error from dnode_hold() (e.g. ENOENT, EIO).
+ *
+ * Fix: the original stored dnode_hold()'s error but dereferenced dn
+ * unconditionally, using an undefined pointer when the hold failed.
+ * Bail out before touching dn.
+ */
+int
+dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+       dnode_t *dn;
+       int error;
+
+       error = dnode_hold(os, object, FTAG, &dn);
+       if (error)
+               return (error);
+       dbuf_rm_spill(dn, tx);
+       rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+       dnode_rm_spill(dn, tx);
+       rw_exit(&dn->dn_struct_rwlock);
+       dnode_rele(dn, FTAG);
+       return (error);
+}
+
 /*
  * returns ENOENT, EIO, or 0.
  */
@@ -150,7 +196,7 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
        dmu_buf_impl_t *db;
        int error;
 
-       error = dnode_hold(os->os, object, FTAG, &dn);
+       error = dnode_hold(os, object, FTAG, &dn);
        if (error)
                return (error);
 
@@ -170,13 +216,68 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
 
        dnode_rele(dn, FTAG);
 
-       VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
+       VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));
 
        *dbp = &db->db;
        return (0);
 }
 
 /*
+ * returns ENOENT, EIO, or 0.
+ *
+ * This interface will allocate a blank spill dbuf when a spill blk
+ * doesn't already exist on the dnode.
+ *
+ * if you only want to find an already existing spill db, then
+ * dmu_spill_hold_existing() should be used.
+ */
+/*
+ * Hold the spill dbuf for dn, allocating a blank one if none exists yet.
+ * Pass DB_RF_HAVESTRUCT in flags if dn_struct_rwlock is already held.
+ * On success *dbp holds a reference the caller must release.
+ */
+int
+dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
+{
+       dmu_buf_impl_t *db = NULL;
+       int err;
+
+       if ((flags & DB_RF_HAVESTRUCT) == 0)
+               rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+       db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
+
+       if ((flags & DB_RF_HAVESTRUCT) == 0)
+               rw_exit(&dn->dn_struct_rwlock);
+
+       ASSERT(db != NULL);
+       err = dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | flags);
+       *dbp = &db->db;
+       return (err);
+}
+
+/*
+ * Hold an already-existing spill dbuf via the object's bonus buffer.
+ * Returns EINVAL if the pool predates SA support, ENOENT if the dnode
+ * has no spill block.
+ */
+int
+dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
+{
+       dnode_t *dn = ((dmu_buf_impl_t *)bonus)->db_dnode;
+       int err;
+
+       if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA)
+               return (EINVAL);
+       rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+       if (!dn->dn_have_spill) {
+               rw_exit(&dn->dn_struct_rwlock);
+               return (ENOENT);
+       }
+       err = dmu_spill_hold_by_dnode(dn, DB_RF_HAVESTRUCT, tag, dbp);
+       rw_exit(&dn->dn_struct_rwlock);
+       return (err);
+}
+
+/* Convenience wrapper: hold (creating if needed) via the bonus dbuf. */
+int
+dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
+{
+       return (dmu_spill_hold_by_dnode(((dmu_buf_impl_t *)bonus)->db_dnode,
+           0, tag, dbp));
+}
+
+/*
  * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
  * to take a held dnode rather than <os, object> -- the lookup is wasteful,
  * and can induce severe lock contention when writing to several files
@@ -282,7 +383,7 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
        dnode_t *dn;
        int err;
 
-       err = dnode_hold(os->os, object, FTAG, &dn);
+       err = dnode_hold(os, object, FTAG, &dn);
        if (err)
                return (err);
 
@@ -335,7 +436,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
                return;
 
        if (len == 0) {  /* they're interested in the bonus buffer */
-               dn = os->os->os_meta_dnode;
+               dn = os->os_meta_dnode;
 
                if (object == 0 || object >= DN_MAX_OBJECT)
                        return;
@@ -352,7 +453,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
         * already cached, we will do a *synchronous* read in the
         * dnode_hold() call.  The same is true for any indirects.
         */
-       err = dnode_hold(os->os, object, FTAG, &dn);
+       err = dnode_hold(os, object, FTAG, &dn);
        if (err != 0)
                return;
 
@@ -484,7 +585,7 @@ dmu_free_long_range(objset_t *os, uint64_t object,
        dnode_t *dn;
        int err;
 
-       err = dnode_hold(os->os, object, FTAG, &dn);
+       err = dnode_hold(os, object, FTAG, &dn);
        if (err != 0)
                return (err);
        err = dmu_free_long_range_impl(os, dn, offset, length, FALSE);
@@ -499,7 +600,7 @@ dmu_free_object(objset_t *os, uint64_t object)
        dmu_tx_t *tx;
        int err;
 
-       err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+       err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
            FTAG, &dn);
        if (err != 0)
                return (err);
@@ -527,7 +628,7 @@ dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, dmu_tx_t *tx)
 {
        dnode_t *dn;
-       int err = dnode_hold(os->os, object, FTAG, &dn);
+       int err = dnode_hold(os, object, FTAG, &dn);
        if (err)
                return (err);
        ASSERT(offset < UINT64_MAX);
@@ -545,7 +646,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
        dmu_buf_t **dbp;
        int numbufs, err;
 
-       err = dnode_hold(os->os, object, FTAG, &dn);
+       err = dnode_hold(os, object, FTAG, &dn);
        if (err)
                return (err);
 
@@ -659,12 +760,136 @@ dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
        dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
+/*
+ * DMU support for xuio
+ */
+kstat_t *xuio_ksp = NULL;
+
+int
+dmu_xuio_init(xuio_t *xuio, int nblk)
+{
+       dmu_xuio_t *priv;
+       uio_t *uio = &xuio->xu_uio;
+
+       uio->uio_iovcnt = nblk;
+       uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
+
+       priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
+       priv->cnt = nblk;
+       priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
+       priv->iovp = uio->uio_iov;
+       XUIO_XUZC_PRIV(xuio) = priv;
+
+       if (XUIO_XUZC_RW(xuio) == UIO_READ)
+               XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
+       else
+               XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);
+
+       return (0);
+}
+
+void
+dmu_xuio_fini(xuio_t *xuio)
+{
+       dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+       int nblk = priv->cnt;
+
+       kmem_free(priv->iovp, nblk * sizeof (iovec_t));
+       kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
+       kmem_free(priv, sizeof (dmu_xuio_t));
+
+       if (XUIO_XUZC_RW(xuio) == UIO_READ)
+               XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
+       else
+               XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
+}
+
+/*
+ * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
+ * and increase priv->next by 1.
+ */
+int
+dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
+{
+       struct iovec *iov;
+       uio_t *uio = &xuio->xu_uio;
+       dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+       int i = priv->next++;
+
+       ASSERT(i < priv->cnt);
+       ASSERT(off + n <= arc_buf_size(abuf));
+       iov = uio->uio_iov + i;
+       iov->iov_base = (char *)abuf->b_data + off;
+       iov->iov_len = n;
+       priv->bufs[i] = abuf;
+       return (0);
+}
+
+int
+dmu_xuio_cnt(xuio_t *xuio)
+{
+       dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+       return (priv->cnt);
+}
+
+arc_buf_t *
+dmu_xuio_arcbuf(xuio_t *xuio, int i)
+{
+       dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+
+       ASSERT(i < priv->cnt);
+       return (priv->bufs[i]);
+}
+
+void
+dmu_xuio_clear(xuio_t *xuio, int i)
+{
+       dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+
+       ASSERT(i < priv->cnt);
+       priv->bufs[i] = NULL;
+}
+
+static void
+xuio_stat_init(void)
+{
+       xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
+           KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
+           KSTAT_FLAG_VIRTUAL);
+       if (xuio_ksp != NULL) {
+               xuio_ksp->ks_data = &xuio_stats;
+               kstat_install(xuio_ksp);
+       }
+}
+
+static void
+xuio_stat_fini(void)
+{
+       if (xuio_ksp != NULL) {
+               kstat_delete(xuio_ksp);
+               xuio_ksp = NULL;
+       }
+}
+
+void
+xuio_stat_wbuf_copied()
+{
+       XUIOSTAT_BUMP(xuiostat_wbuf_copied);
+}
+
+void
+xuio_stat_wbuf_nocopy()
+{
+       XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
+}
+
 #ifdef _KERNEL
 int
 dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
 {
        dmu_buf_t **dbp;
        int numbufs, i, err;
+       xuio_t *xuio = NULL;
 
        /*
         * NB: we could do this block-at-a-time, but it's nice
@@ -675,6 +900,9 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
        if (err)
                return (err);
 
+       if (uio->uio_extflg == UIO_XUIO)
+               xuio = (xuio_t *)uio;
+
        for (i = 0; i < numbufs; i++) {
                int tocpy;
                int bufoff;
@@ -685,8 +913,24 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
                bufoff = uio->uio_loffset - db->db_offset;
                tocpy = (int)MIN(db->db_size - bufoff, size);
 
-               err = uiomove((char *)db->db_data + bufoff, tocpy,
-                   UIO_READ, uio);
+               if (xuio) {
+                       dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+                       arc_buf_t *dbuf_abuf = dbi->db_buf;
+                       arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
+                       err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
+                       if (!err) {
+                               uio->uio_resid -= tocpy;
+                               uio->uio_loffset += tocpy;
+                       }
+
+                       if (abuf == dbuf_abuf)
+                               XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
+                       else
+                               XUIOSTAT_BUMP(xuiostat_rbuf_copied);
+               } else {
+                       err = uiomove((char *)db->db_data + bufoff, tocpy,
+                           UIO_READ, uio);
+               }
                if (err)
                        break;
 
@@ -697,19 +941,16 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
        return (err);
 }
 
-int
-dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
-    dmu_tx_t *tx)
+static int
+dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
 {
        dmu_buf_t **dbp;
-       int numbufs, i;
+       int numbufs;
        int err = 0;
+       int i;
 
-       if (size == 0)
-               return (0);
-
-       err = dmu_buf_hold_array(os, object, uio->uio_loffset, size,
-           FALSE, FTAG, &numbufs, &dbp);
+       err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
+           FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
        if (err)
                return (err);
 
@@ -747,11 +988,44 @@ dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
 
                size -= tocpy;
        }
+
        dmu_buf_rele_array(dbp, numbufs, FTAG);
        return (err);
 }
 
 int
+dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
+    dmu_tx_t *tx)
+{
+       if (size == 0)
+               return (0);
+
+       return (dmu_write_uio_dnode(((dmu_buf_impl_t *)zdb)->db_dnode,
+           uio, size, tx));
+}
+
+int
+dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
+    dmu_tx_t *tx)
+{
+       dnode_t *dn;
+       int err;
+
+       if (size == 0)
+               return (0);
+
+       err = dnode_hold(os, object, FTAG, &dn);
+       if (err)
+               return (err);
+
+       err = dmu_write_uio_dnode(dn, uio, size, tx);
+
+       dnode_rele(dn, FTAG);
+
+       return (err);
+}
+
+int
 dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     page_t *pp, dmu_tx_t *tx)
 {
@@ -852,55 +1126,122 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
                dbuf_rele(db, FTAG);
        } else {
                dbuf_rele(db, FTAG);
-               ASSERT(dn->dn_objset->os.os == dn->dn_objset);
-               dmu_write(&dn->dn_objset->os, dn->dn_object, offset, blksz,
+               dmu_write(dn->dn_objset, dn->dn_object, offset, blksz,
                    buf->b_data, tx);
                dmu_return_arcbuf(buf);
+               XUIOSTAT_BUMP(xuiostat_wbuf_copied);
        }
 }
 
 typedef struct {
-       dbuf_dirty_record_t     *dr;
-       dmu_sync_cb_t           *done;
-       void                    *arg;
+       dbuf_dirty_record_t     *dsa_dr;
+       dmu_sync_cb_t           *dsa_done;
+       zgd_t                   *dsa_zgd;
+       dmu_tx_t                *dsa_tx;
 } dmu_sync_arg_t;
 
 /* ARGSUSED */
 static void
 dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
 {
+       dmu_sync_arg_t *dsa = varg;
+       dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
+       dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
        blkptr_t *bp = zio->io_bp;
 
-       if (!BP_IS_HOLE(bp)) {
-               dmu_sync_arg_t *in = varg;
-               dbuf_dirty_record_t *dr = in->dr;
-               dmu_buf_impl_t *db = dr->dr_dbuf;
-               ASSERT(BP_GET_TYPE(bp) == db->db_dnode->dn_type);
-               ASSERT(BP_GET_LEVEL(bp) == 0);
-               bp->blk_fill = 1;
+       if (zio->io_error == 0) {
+               if (BP_IS_HOLE(bp)) {
+                       /*
+                        * A block of zeros may compress to a hole, but the
+                        * block size still needs to be known for replay.
+                        */
+                       BP_SET_LSIZE(bp, db->db_size);
+               } else {
+                       ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
+                       ASSERT(BP_GET_LEVEL(bp) == 0);
+                       bp->blk_fill = 1;
+               }
        }
 }
 
+static void
+dmu_sync_late_arrival_ready(zio_t *zio)
+{
+       dmu_sync_ready(zio, NULL, zio->io_private);
+}
+
 /* ARGSUSED */
 static void
 dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
 {
-       dmu_sync_arg_t *in = varg;
-       dbuf_dirty_record_t *dr = in->dr;
+       dmu_sync_arg_t *dsa = varg;
+       dbuf_dirty_record_t *dr = dsa->dsa_dr;
        dmu_buf_impl_t *db = dr->dr_dbuf;
-       dmu_sync_cb_t *done = in->done;
 
        mutex_enter(&db->db_mtx);
        ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
-       dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */
-       dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
+       if (zio->io_error == 0) {
+               dr->dt.dl.dr_overridden_by = *zio->io_bp;
+               dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
+               dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
+               if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by))
+                       BP_ZERO(&dr->dt.dl.dr_overridden_by);
+       } else {
+               dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+       }
        cv_broadcast(&db->db_changed);
        mutex_exit(&db->db_mtx);
 
-       if (done)
-               done(&(db->db), in->arg);
+       dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
 
-       kmem_free(in, sizeof (dmu_sync_arg_t));
+       kmem_free(dsa, sizeof (*dsa));
+}
+
+static void
+dmu_sync_late_arrival_done(zio_t *zio)
+{
+       blkptr_t *bp = zio->io_bp;
+       dmu_sync_arg_t *dsa = zio->io_private;
+
+       if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
+               ASSERT(zio->io_bp->blk_birth == zio->io_txg);
+               ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
+               zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+       }
+
+       dmu_tx_commit(dsa->dsa_tx);
+
+       dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+
+       kmem_free(dsa, sizeof (*dsa));
+}
+
+static int
+dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
+    zio_prop_t *zp, zbookmark_t *zb)
+{
+       dmu_sync_arg_t *dsa;
+       dmu_tx_t *tx;
+
+       tx = dmu_tx_create(os);
+       dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
+       if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
+               dmu_tx_abort(tx);
+               return (EIO);   /* Make zl_get_data do txg_wait_synced() */
+       }
+
+       dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+       dsa->dsa_dr = NULL;
+       dsa->dsa_done = done;
+       dsa->dsa_zgd = zgd;
+       dsa->dsa_tx = tx;
+
+       zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
+           zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
+           dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
+           ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
+
+       return (0);
 }
 
 /*
@@ -919,156 +1260,108 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
  *     EALREADY: this block is already in the process of being synced.
  *             The caller should track its progress (somehow).
  *
- *     EINPROGRESS: the IO has been initiated.
- *             The caller should log this blkptr in the callback.
+ *     EIO: could not do the I/O.
+ *             The caller should do a txg_wait_synced().
  *
- *     0: completed.  Sets *bp to the blkptr just written.
- *             The caller should log this blkptr immediately.
+ *     0: the I/O has been initiated.
+ *             The caller should log this blkptr in the done callback.
+ *             It is possible that the I/O will fail, in which case
+ *             the error will be reported to the done callback and
+ *             propagated to pio from zio_done().
  */
 int
-dmu_sync(zio_t *pio, dmu_buf_t *db_fake,
-    blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg)
+dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
 {
-       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-       objset_impl_t *os = db->db_objset;
-       dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
-       tx_state_t *tx = &dp->dp_tx;
+       blkptr_t *bp = zgd->zgd_bp;
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
+       objset_t *os = db->db_objset;
+       dsl_dataset_t *ds = os->os_dsl_dataset;
        dbuf_dirty_record_t *dr;
-       dmu_sync_arg_t *in;
+       dmu_sync_arg_t *dsa;
        zbookmark_t zb;
-       writeprops_t wp = { 0 };
-       zio_t *zio;
-       int err;
+       zio_prop_t zp;
 
+       ASSERT(pio != NULL);
        ASSERT(BP_IS_HOLE(bp));
        ASSERT(txg != 0);
 
-       dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
-           txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);
+       SET_BOOKMARK(&zb, ds->ds_object,
+           db->db.db_object, db->db_level, db->db_blkid);
+
+       dmu_write_policy(os, db->db_dnode, db->db_level, WP_DMU_SYNC, &zp);
 
        /*
-        * XXX - would be nice if we could do this without suspending...
+        * If we're frozen (running ziltest), we always need to generate a bp.
         */
-       txg_suspend(dp);
+       if (txg > spa_freeze_txg(os->os_spa))
+               return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
 
        /*
-        * If this txg already synced, there's nothing to do.
+        * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
+        * and us.  If we determine that this txg is not yet syncing,
+        * but it begins to sync a moment later, that's OK because the
+        * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
         */
-       if (txg <= tx->tx_synced_txg) {
-               txg_resume(dp);
+       mutex_enter(&db->db_mtx);
+
+       if (txg <= spa_last_synced_txg(os->os_spa)) {
                /*
-                * If we're running ziltest, we need the blkptr regardless.
+                * This txg has already synced.  There's nothing to do.
                 */
-               if (txg > spa_freeze_txg(dp->dp_spa)) {
-                       /* if db_blkptr == NULL, this was an empty write */
-                       if (db->db_blkptr)
-                               *bp = *db->db_blkptr; /* structure assignment */
-                       return (0);
-               }
+               mutex_exit(&db->db_mtx);
                return (EEXIST);
        }
 
-       mutex_enter(&db->db_mtx);
-
-       if (txg == tx->tx_syncing_txg) {
-               while (db->db_data_pending) {
-                       /*
-                        * IO is in-progress.  Wait for it to finish.
-                        * XXX - would be nice to be able to somehow "attach"
-                        * this zio to the parent zio passed in.
-                        */
-                       cv_wait(&db->db_changed, &db->db_mtx);
-                       if (!db->db_data_pending &&
-                           db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) {
-                               /*
-                                * IO was compressed away
-                                */
-                               *bp = *db->db_blkptr; /* structure assignment */
-                               mutex_exit(&db->db_mtx);
-                               txg_resume(dp);
-                               return (0);
-                       }
-                       ASSERT(db->db_data_pending ||
-                           (db->db_blkptr && db->db_blkptr->blk_birth == txg));
-               }
-
-               if (db->db_blkptr && db->db_blkptr->blk_birth == txg) {
-                       /*
-                        * IO is already completed.
-                        */
-                       *bp = *db->db_blkptr; /* structure assignment */
-                       mutex_exit(&db->db_mtx);
-                       txg_resume(dp);
-                       return (0);
-               }
+       if (txg <= spa_syncing_txg(os->os_spa)) {
+               /*
+                * This txg is currently syncing, so we can't mess with
+                * the dirty record anymore; just write a new log block.
+                */
+               mutex_exit(&db->db_mtx);
+               return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
        }
 
        dr = db->db_last_dirty;
-       while (dr && dr->dr_txg > txg)
+       while (dr && dr->dr_txg != txg)
                dr = dr->dr_next;
-       if (dr == NULL || dr->dr_txg < txg) {
+
+       if (dr == NULL) {
                /*
-                * This dbuf isn't dirty, must have been free_range'd.
+                * There's no dr for this dbuf, so it must have been freed.
                 * There's no need to log writes to freed blocks, so we're done.
                 */
                mutex_exit(&db->db_mtx);
-               txg_resume(dp);
                return (ENOENT);
        }
 
        ASSERT(dr->dr_txg == txg);
-       if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
-               /*
-                * We have already issued a sync write for this buffer.
-                */
-               mutex_exit(&db->db_mtx);
-               txg_resume(dp);
-               return (EALREADY);
-       } else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+       if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
+           dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
                /*
-                * This buffer has already been synced.  It could not
+                * We have already issued a sync write for this buffer,
+                * or this buffer has already been synced.  It could not
                 * have been dirtied since, or we would have cleared the state.
                 */
-               *bp = dr->dt.dl.dr_overridden_by; /* structure assignment */
                mutex_exit(&db->db_mtx);
-               txg_resume(dp);
-               return (0);
+               return (EALREADY);
        }
 
+       ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
        dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
-       in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
-       in->dr = dr;
-       in->done = done;
-       in->arg = arg;
        mutex_exit(&db->db_mtx);
-       txg_resume(dp);
-
-       zb.zb_objset = os->os_dsl_dataset->ds_object;
-       zb.zb_object = db->db.db_object;
-       zb.zb_level = db->db_level;
-       zb.zb_blkid = db->db_blkid;
 
-       wp.wp_type = db->db_dnode->dn_type;
-       wp.wp_level = db->db_level;
-       wp.wp_copies = os->os_copies;
-       wp.wp_dnchecksum = db->db_dnode->dn_checksum;
-       wp.wp_oschecksum = os->os_checksum;
-       wp.wp_dncompress = db->db_dnode->dn_compress;
-       wp.wp_oscompress = os->os_compress;
+       dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+       dsa->dsa_dr = dr;
+       dsa->dsa_done = done;
+       dsa->dsa_zgd = zgd;
+       dsa->dsa_tx = NULL;
 
-       ASSERT(BP_IS_HOLE(bp));
+       zio_nowait(arc_write(pio, os->os_spa, txg,
+           bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp,
+           dmu_sync_ready, dmu_sync_done, dsa,
+           ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
 
-       zio = arc_write(pio, os->os_spa, &wp, DBUF_IS_L2CACHEABLE(db),
-           txg, bp, dr->dt.dl.dr_data, dmu_sync_ready, dmu_sync_done, in,
-           ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
-       if (pio) {
-               zio_nowait(zio);
-               err = EINPROGRESS;
-       } else {
-               err = zio_wait(zio);
-               ASSERT(err == 0);
-       }
-       return (err);
+       return (0);
 }
 
 int
@@ -1078,7 +1371,7 @@ dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
        dnode_t *dn;
        int err;
 
-       err = dnode_hold(os->os, object, FTAG, &dn);
+       err = dnode_hold(os, object, FTAG, &dn);
        if (err)
                return (err);
        err = dnode_set_blksz(dn, size, ibs, tx);
@@ -1093,7 +1386,7 @@ dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
        dnode_t *dn;
 
        /* XXX assumes dnode_hold will not get an i/o error */
-       (void) dnode_hold(os->os, object, FTAG, &dn);
+       (void) dnode_hold(os, object, FTAG, &dn);
        ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
        dn->dn_checksum = checksum;
        dnode_setdirty(dn, tx);
@@ -1107,20 +1400,98 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
        dnode_t *dn;
 
        /* XXX assumes dnode_hold will not get an i/o error */
-       (void) dnode_hold(os->os, object, FTAG, &dn);
+       (void) dnode_hold(os, object, FTAG, &dn);
        ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
        dn->dn_compress = compress;
        dnode_setdirty(dn, tx);
        dnode_rele(dn, FTAG);
 }
 
+int zfs_mdcomp_disable = 0;
+
+void
+dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
+{
+       dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
+       boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata);
+       enum zio_checksum checksum = os->os_checksum;
+       enum zio_compress compress = os->os_compress;
+       enum zio_checksum dedup_checksum = os->os_dedup_checksum;
+       boolean_t dedup;
+       boolean_t dedup_verify = os->os_dedup_verify;
+       int copies = os->os_copies;
+
+       /*
+        * Determine checksum setting.
+        */
+       if (ismd) {
+               /*
+                * Metadata always gets checksummed.  If the data
+                * checksum is multi-bit correctable, and it's not a
+                * ZBT-style checksum, then it's suitable for metadata
+                * as well.  Otherwise, the metadata checksum defaults
+                * to fletcher4.
+                */
+               if (zio_checksum_table[checksum].ci_correctable < 1 ||
+                   zio_checksum_table[checksum].ci_eck)
+                       checksum = ZIO_CHECKSUM_FLETCHER_4;
+       } else {
+               checksum = zio_checksum_select(dn->dn_checksum, checksum);
+       }
+
+       /*
+        * Determine compression setting.
+        */
+       if (ismd) {
+               /*
+                * XXX -- we should design a compression algorithm
+                * that specializes in arrays of bps.
+                */
+               compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
+                   ZIO_COMPRESS_LZJB;
+       } else {
+               compress = zio_compress_select(dn->dn_compress, compress);
+       }
+
+       /*
+        * Determine dedup setting.  If we are in dmu_sync(), we won't
+        * actually dedup now because that's all done in syncing context;
+        * but we do want to use the dedup checksum.  If the checksum is not
+        * strong enough to ensure unique signatures, force dedup_verify.
+        */
+       dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF);
+       if (dedup) {
+               checksum = dedup_checksum;
+               if (!zio_checksum_table[checksum].ci_dedup)
+                       dedup_verify = 1;
+       }
+
+       if (wp & WP_DMU_SYNC)
+               dedup = 0;
+
+       if (wp & WP_NOFILL) {
+               ASSERT(!ismd && level == 0);
+               checksum = ZIO_CHECKSUM_OFF;
+               compress = ZIO_COMPRESS_OFF;
+               dedup = B_FALSE;
+       }
+
+       zp->zp_checksum = checksum;
+       zp->zp_compress = compress;
+       zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
+       zp->zp_level = level;
+       zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
+       zp->zp_dedup = dedup;
+       zp->zp_dedup_verify = dedup && dedup_verify;
+}
+
 int
 dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
 {
        dnode_t *dn;
        int i, err;
 
-       err = dnode_hold(os->os, object, FTAG, &dn);
+       err = dnode_hold(os, object, FTAG, &dn);
        if (err)
                return (err);
        /*
@@ -1134,7 +1505,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
        if (i != TXG_SIZE) {
                dnode_rele(dn, FTAG);
                txg_wait_synced(dmu_objset_pool(os), 0);
-               err = dnode_hold(os->os, object, FTAG, &dn);
+               err = dnode_hold(os, object, FTAG, &dn);
                if (err)
                        return (err);
        }
@@ -1148,21 +1519,27 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
 void
 dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
 {
+       dnode_phys_t *dnp;
+
        rw_enter(&dn->dn_struct_rwlock, RW_READER);
        mutex_enter(&dn->dn_mtx);
 
+       dnp = dn->dn_phys;
+
        doi->doi_data_block_size = dn->dn_datablksz;
        doi->doi_metadata_block_size = dn->dn_indblkshift ?
            1ULL << dn->dn_indblkshift : 0;
+       doi->doi_type = dn->dn_type;
+       doi->doi_bonus_type = dn->dn_bonustype;
+       doi->doi_bonus_size = dn->dn_bonuslen;
        doi->doi_indirection = dn->dn_nlevels;
        doi->doi_checksum = dn->dn_checksum;
        doi->doi_compress = dn->dn_compress;
-       doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) +
-           SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT;
-       doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid;
-       doi->doi_type = dn->dn_type;
-       doi->doi_bonus_size = dn->dn_bonuslen;
-       doi->doi_bonus_type = dn->dn_bonustype;
+       doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
+       doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz;
+       doi->doi_fill_count = 0;
+       for (int i = 0; i < dnp->dn_nblkptr; i++)
+               doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill;
 
        mutex_exit(&dn->dn_mtx);
        rw_exit(&dn->dn_struct_rwlock);
@@ -1176,7 +1553,7 @@ int
 dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
 {
        dnode_t *dn;
-       int err = dnode_hold(os->os, object, FTAG, &dn);
+       int err = dnode_hold(os, object, FTAG, &dn);
 
        if (err)
                return (err);
@@ -1260,17 +1637,25 @@ byteswap_uint8_array(void *vbuf, size_t size)
 void
 dmu_init(void)
 {
+       zfs_dbgmsg_init();
        dbuf_init();
        dnode_init();
+       zfetch_init();
        arc_init();
        l2arc_init();
+       xuio_stat_init();
+       sa_cache_init();
 }
 
 void
 dmu_fini(void)
 {
        arc_fini();
+       zfetch_fini();
        dnode_fini();
        dbuf_fini();
        l2arc_fini();
+       xuio_stat_fini();
+       sa_cache_fini();
+       zfs_dbgmsg_fini();
 }
index 1f91fc1..98228d4 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -32,16 +31,15 @@ uint64_t
 dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
-       objset_impl_t *osi = os->os;
        uint64_t object;
        uint64_t L2_dnode_count = DNODES_PER_BLOCK <<
-           (osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT);
+           (os->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT);
        dnode_t *dn = NULL;
        int restarted = B_FALSE;
 
-       mutex_enter(&osi->os_obj_lock);
+       mutex_enter(&os->os_obj_lock);
        for (;;) {
-               object = osi->os_obj_next;
+               object = os->os_obj_next;
                /*
                 * Each time we polish off an L2 bp worth of dnodes
                 * (2^13 objects), move to another L2 bp that's still
@@ -51,14 +49,14 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
                 */
                if (P2PHASE(object, L2_dnode_count) == 0) {
                        uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
-                       int error = dnode_next_offset(osi->os_meta_dnode,
+                       int error = dnode_next_offset(os->os_meta_dnode,
                            DNODE_FIND_HOLE,
                            &offset, 2, DNODES_PER_BLOCK >> 2, 0);
                        restarted = B_TRUE;
                        if (error == 0)
                                object = offset >> DNODE_SHIFT;
                }
-               osi->os_obj_next = ++object;
+               os->os_obj_next = ++object;
 
                /*
                 * XXX We should check for an i/o error here and return
@@ -66,19 +64,19 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
                 * dmu_tx_assign(), but there is currently no mechanism
                 * to do so.
                 */
-               (void) dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE,
+               (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
                    FTAG, &dn);
                if (dn)
                        break;
 
                if (dmu_object_next(os, &object, B_TRUE, 0) == 0)
-                       osi->os_obj_next = object - 1;
+                       os->os_obj_next = object - 1;
        }
 
        dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
        dnode_rele(dn, FTAG);
 
-       mutex_exit(&osi->os_obj_lock);
+       mutex_exit(&os->os_obj_lock);
 
        dmu_tx_add_new_object(tx, os, object);
        return (object);
@@ -94,7 +92,7 @@ dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
        if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
                return (EBADF);
 
-       err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG, &dn);
+       err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, FTAG, &dn);
        if (err)
                return (err);
        dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
@@ -116,7 +114,7 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
        if (object == DMU_META_DNODE_OBJECT)
                return (EBADF);
 
-       err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+       err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
            FTAG, &dn);
        if (err)
                return (err);
@@ -128,7 +126,11 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
                return (0);
        }
 
-       nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+       if (bonustype == DMU_OT_SA) {
+               nblkptr = 1;
+       } else {
+               nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+       }
 
        /*
         * If we are losing blkptrs or changing the block size this must
@@ -166,7 +168,7 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
 
        ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
 
-       err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+       err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
            FTAG, &dn);
        if (err)
                return (err);
@@ -185,7 +187,7 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
        uint64_t offset = (*objectp + 1) << DNODE_SHIFT;
        int error;
 
-       error = dnode_next_offset(os->os->os_meta_dnode,
+       error = dnode_next_offset(os->os_meta_dnode,
            (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
 
        *objectp = offset >> DNODE_SHIFT;
index 5a9d25b..690e6ec 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
+/* Portions Copyright 2010 Robert Milkowski */
+
 #include <sys/cred.h>
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dbuf.h>
 #include <sys/zvol.h>
 #include <sys/dmu_tx.h>
-#include <sys/zio_checksum.h>
 #include <sys/zap.h>
 #include <sys/zil.h>
 #include <sys/dmu_impl.h>
 #include <sys/zfs_ioctl.h>
+#include <sys/sunddi.h>
+#include <sys/sa.h>
 
 spa_t *
 dmu_objset_spa(objset_t *os)
 {
-       return (os->os->os_spa);
+       return (os->os_spa);
 }
 
 zilog_t *
 dmu_objset_zil(objset_t *os)
 {
-       return (os->os->os_zil);
+       return (os->os_zil);
 }
 
 dsl_pool_t *
@@ -59,82 +61,112 @@ dmu_objset_pool(objset_t *os)
 {
        dsl_dataset_t *ds;
 
-       if ((ds = os->os->os_dsl_dataset) != NULL && ds->ds_dir)
+       if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)
                return (ds->ds_dir->dd_pool);
        else
-               return (spa_get_dsl(os->os->os_spa));
+               return (spa_get_dsl(os->os_spa));
 }
 
 dsl_dataset_t *
 dmu_objset_ds(objset_t *os)
 {
-       return (os->os->os_dsl_dataset);
+       return (os->os_dsl_dataset);
 }
 
 dmu_objset_type_t
 dmu_objset_type(objset_t *os)
 {
-       return (os->os->os_phys->os_type);
+       return (os->os_phys->os_type);
 }
 
 void
 dmu_objset_name(objset_t *os, char *buf)
 {
-       dsl_dataset_name(os->os->os_dsl_dataset, buf);
+       dsl_dataset_name(os->os_dsl_dataset, buf);
 }
 
 uint64_t
 dmu_objset_id(objset_t *os)
 {
-       dsl_dataset_t *ds = os->os->os_dsl_dataset;
+       dsl_dataset_t *ds = os->os_dsl_dataset;
 
        return (ds ? ds->ds_object : 0);
 }
 
+uint64_t
+dmu_objset_syncprop(objset_t *os)
+{
+       return (os->os_sync);
+}
+
+uint64_t
+dmu_objset_logbias(objset_t *os)
+{
+       return (os->os_logbias);
+}
+
 static void
 checksum_changed_cb(void *arg, uint64_t newval)
 {
-       objset_impl_t *osi = arg;
+       objset_t *os = arg;
 
        /*
         * Inheritance should have been done by now.
         */
        ASSERT(newval != ZIO_CHECKSUM_INHERIT);
 
-       osi->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
+       os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
 }
 
 static void
 compression_changed_cb(void *arg, uint64_t newval)
 {
-       objset_impl_t *osi = arg;
+       objset_t *os = arg;
 
        /*
         * Inheritance and range checking should have been done by now.
         */
        ASSERT(newval != ZIO_COMPRESS_INHERIT);
 
-       osi->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
+       os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
 }
 
 static void
 copies_changed_cb(void *arg, uint64_t newval)
 {
-       objset_impl_t *osi = arg;
+       objset_t *os = arg;
 
        /*
         * Inheritance and range checking should have been done by now.
         */
        ASSERT(newval > 0);
-       ASSERT(newval <= spa_max_replication(osi->os_spa));
+       ASSERT(newval <= spa_max_replication(os->os_spa));
 
-       osi->os_copies = newval;
+       os->os_copies = newval;
+}
+
+static void
+dedup_changed_cb(void *arg, uint64_t newval)
+{
+       objset_t *os = arg;
+       spa_t *spa = os->os_spa;
+       enum zio_checksum checksum;
+
+       /*
+        * Inheritance should have been done by now.
+        */
+       ASSERT(newval != ZIO_CHECKSUM_INHERIT);
+
+       checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);
+
+       os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
+       os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
 }
 
 static void
 primary_cache_changed_cb(void *arg, uint64_t newval)
 {
-       objset_impl_t *osi = arg;
+       objset_t *os = arg;
 
        /*
         * Inheritance and range checking should have been done by now.
@@ -142,13 +174,13 @@ primary_cache_changed_cb(void *arg, uint64_t newval)
        ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
            newval == ZFS_CACHE_METADATA);
 
-       osi->os_primary_cache = newval;
+       os->os_primary_cache = newval;
 }
 
 static void
 secondary_cache_changed_cb(void *arg, uint64_t newval)
 {
-       objset_impl_t *osi = arg;
+       objset_t *os = arg;
 
        /*
         * Inheritance and range checking should have been done by now.
@@ -156,7 +188,35 @@ secondary_cache_changed_cb(void *arg, uint64_t newval)
        ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
            newval == ZFS_CACHE_METADATA);
 
-       osi->os_secondary_cache = newval;
+       os->os_secondary_cache = newval;
+}
+
+static void
+sync_changed_cb(void *arg, uint64_t newval)
+{
+       objset_t *os = arg;
+
+       /*
+        * Inheritance and range checking should have been done by now.
+        */
+       ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
+           newval == ZFS_SYNC_DISABLED);
+
+       os->os_sync = newval;
+       if (os->os_zil)
+               zil_set_sync(os->os_zil, newval);
+}
+
+static void
+logbias_changed_cb(void *arg, uint64_t newval)
+{
+       objset_t *os = arg;
+
+       ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
+           newval == ZFS_LOGBIAS_THROUGHPUT);
+       os->os_logbias = newval;
+       if (os->os_zil)
+               zil_set_logbias(os->os_zil, newval);
 }
 
 void
@@ -177,39 +237,37 @@ dmu_objset_byteswap(void *buf, size_t size)
 
 int
 dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
-    objset_impl_t **osip)
+    objset_t **osp)
 {
-       objset_impl_t *osi;
+       objset_t *os;
        int i, err;
 
        ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
 
-       osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP);
-       osi->os.os = osi;
-       osi->os_dsl_dataset = ds;
-       osi->os_spa = spa;
-       osi->os_rootbp = bp;
-       if (!BP_IS_HOLE(osi->os_rootbp)) {
+       os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
+       os->os_dsl_dataset = ds;
+       os->os_spa = spa;
+       os->os_rootbp = bp;
+       if (!BP_IS_HOLE(os->os_rootbp)) {
                uint32_t aflags = ARC_WAIT;
                zbookmark_t zb;
-               zb.zb_objset = ds ? ds->ds_object : 0;
-               zb.zb_object = 0;
-               zb.zb_level = -1;
-               zb.zb_blkid = 0;
-               if (DMU_OS_IS_L2CACHEABLE(osi))
+               SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+                   ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+
+               if (DMU_OS_IS_L2CACHEABLE(os))
                        aflags |= ARC_L2CACHE;
 
-               dprintf_bp(osi->os_rootbp, "reading %s", "");
+               dprintf_bp(os->os_rootbp, "reading %s", "");
                /*
-                * NB: when bprewrite scrub can change the bp,
+                * XXX when bprewrite scrub can change the bp,
                 * and this is called from dmu_objset_open_ds_os, the bp
                 * could change, and we'll need a lock.
                 */
-               err = arc_read_nolock(NULL, spa, osi->os_rootbp,
-                   arc_getbuf_func, &osi->os_phys_buf,
+               err = dsl_read_nolock(NULL, spa, os->os_rootbp,
+                   arc_getbuf_func, &os->os_phys_buf,
                    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
                if (err) {
-                       kmem_free(osi, sizeof (objset_impl_t));
+                       kmem_free(os, sizeof (objset_t));
                        /* convert checksum errors into IO errors */
                        if (err == ECKSUM)
                                err = EIO;
@@ -218,27 +276,27 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
 
                /* Increase the blocksize if we are permitted. */
                if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
-                   arc_buf_size(osi->os_phys_buf) < sizeof (objset_phys_t)) {
+                   arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
                        arc_buf_t *buf = arc_buf_alloc(spa,
-                           sizeof (objset_phys_t), &osi->os_phys_buf,
+                           sizeof (objset_phys_t), &os->os_phys_buf,
                            ARC_BUFC_METADATA);
                        bzero(buf->b_data, sizeof (objset_phys_t));
-                       bcopy(osi->os_phys_buf->b_data, buf->b_data,
-                           arc_buf_size(osi->os_phys_buf));
-                       (void) arc_buf_remove_ref(osi->os_phys_buf,
-                           &osi->os_phys_buf);
-                       osi->os_phys_buf = buf;
+                       bcopy(os->os_phys_buf->b_data, buf->b_data,
+                           arc_buf_size(os->os_phys_buf));
+                       (void) arc_buf_remove_ref(os->os_phys_buf,
+                           &os->os_phys_buf);
+                       os->os_phys_buf = buf;
                }
 
-               osi->os_phys = osi->os_phys_buf->b_data;
-               osi->os_flags = osi->os_phys->os_flags;
+               os->os_phys = os->os_phys_buf->b_data;
+               os->os_flags = os->os_phys->os_flags;
        } else {
                int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
                    sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
-               osi->os_phys_buf = arc_buf_alloc(spa, size,
-                   &osi->os_phys_buf, ARC_BUFC_METADATA);
-               osi->os_phys = osi->os_phys_buf->b_data;
-               bzero(osi->os_phys, size);
+               os->os_phys_buf = arc_buf_alloc(spa, size,
+                   &os->os_phys_buf, ARC_BUFC_METADATA);
+               os->os_phys = os->os_phys_buf->b_data;
+               bzero(os->os_phys, size);
        }
 
        /*
@@ -249,61 +307,74 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
         */
        if (ds) {
                err = dsl_prop_register(ds, "primarycache",
-                   primary_cache_changed_cb, osi);
+                   primary_cache_changed_cb, os);
                if (err == 0)
                        err = dsl_prop_register(ds, "secondarycache",
-                           secondary_cache_changed_cb, osi);
+                           secondary_cache_changed_cb, os);
                if (!dsl_dataset_is_snapshot(ds)) {
                        if (err == 0)
                                err = dsl_prop_register(ds, "checksum",
-                                   checksum_changed_cb, osi);
+                                   checksum_changed_cb, os);
                        if (err == 0)
                                err = dsl_prop_register(ds, "compression",
-                                   compression_changed_cb, osi);
+                                   compression_changed_cb, os);
                        if (err == 0)
                                err = dsl_prop_register(ds, "copies",
-                                   copies_changed_cb, osi);
+                                   copies_changed_cb, os);
+                       if (err == 0)
+                               err = dsl_prop_register(ds, "dedup",
+                                   dedup_changed_cb, os);
+                       if (err == 0)
+                               err = dsl_prop_register(ds, "logbias",
+                                   logbias_changed_cb, os);
+                       if (err == 0)
+                               err = dsl_prop_register(ds, "sync",
+                                   sync_changed_cb, os);
                }
                if (err) {
-                       VERIFY(arc_buf_remove_ref(osi->os_phys_buf,
-                           &osi->os_phys_buf) == 1);
-                       kmem_free(osi, sizeof (objset_impl_t));
+                       VERIFY(arc_buf_remove_ref(os->os_phys_buf,
+                           &os->os_phys_buf) == 1);
+                       kmem_free(os, sizeof (objset_t));
                        return (err);
                }
        } else if (ds == NULL) {
                /* It's the meta-objset. */
-               osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
-               osi->os_compress = ZIO_COMPRESS_LZJB;
-               osi->os_copies = spa_max_replication(spa);
-               osi->os_primary_cache = ZFS_CACHE_ALL;
-               osi->os_secondary_cache = ZFS_CACHE_ALL;
+               os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
+               os->os_compress = ZIO_COMPRESS_LZJB;
+               os->os_copies = spa_max_replication(spa);
+               os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
+               os->os_dedup_verify = 0;
+               os->os_logbias = 0;
+               os->os_sync = 0;
+               os->os_primary_cache = ZFS_CACHE_ALL;
+               os->os_secondary_cache = ZFS_CACHE_ALL;
        }
 
-       osi->os_zil_header = osi->os_phys->os_zil_header;
-       osi->os_zil = zil_alloc(&osi->os, &osi->os_zil_header);
+       os->os_zil_header = os->os_phys->os_zil_header;
+       os->os_zil = zil_alloc(os, &os->os_zil_header);
 
        for (i = 0; i < TXG_SIZE; i++) {
-               list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t),
+               list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),
                    offsetof(dnode_t, dn_dirty_link[i]));
-               list_create(&osi->os_free_dnodes[i], sizeof (dnode_t),
+               list_create(&os->os_free_dnodes[i], sizeof (dnode_t),
                    offsetof(dnode_t, dn_dirty_link[i]));
        }
-       list_create(&osi->os_dnodes, sizeof (dnode_t),
+       list_create(&os->os_dnodes, sizeof (dnode_t),
            offsetof(dnode_t, dn_link));
-       list_create(&osi->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
+       list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
            offsetof(dmu_buf_impl_t, db_link));
 
-       mutex_init(&osi->os_lock, NULL, MUTEX_DEFAULT, NULL);
-       mutex_init(&osi->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
-       mutex_init(&osi->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
-
-       osi->os_meta_dnode = dnode_special_open(osi,
-           &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
-       if (arc_buf_size(osi->os_phys_buf) >= sizeof (objset_phys_t)) {
-               osi->os_userused_dnode = dnode_special_open(osi,
-                   &osi->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT);
-               osi->os_groupused_dnode = dnode_special_open(osi,
-                   &osi->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT);
+       mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
+       mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
+       mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
+
+       os->os_meta_dnode = dnode_special_open(os,
+           &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
+       if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
+               os->os_userused_dnode = dnode_special_open(os,
+                   &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT);
+               os->os_groupused_dnode = dnode_special_open(os,
+                   &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT);
        }
 
        /*
@@ -311,117 +382,96 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
         * have ds_opening_lock
         */
        if (ds) {
-               VERIFY(NULL == dsl_dataset_set_user_ptr(ds, osi,
-                   dmu_objset_evict));
+               mutex_enter(&ds->ds_lock);
+               ASSERT(ds->ds_objset == NULL);
+               ds->ds_objset = os;
+               mutex_exit(&ds->ds_lock);
        }
 
-       *osip = osi;
+       *osp = os;
        return (0);
 }
 
-static int
-dmu_objset_open_ds_os(dsl_dataset_t *ds, objset_t *os, dmu_objset_type_t type)
+int
+dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
 {
-       objset_impl_t *osi;
+       int err = 0;
 
        mutex_enter(&ds->ds_opening_lock);
-       osi = dsl_dataset_get_user_ptr(ds);
-       if (osi == NULL) {
-               int err;
-
+       *osp = ds->ds_objset;
+       if (*osp == NULL) {
                err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
-                   ds, &ds->ds_phys->ds_bp, &osi);
-               if (err) {
-                       mutex_exit(&ds->ds_opening_lock);
-                       return (err);
-               }
+                   ds, &ds->ds_phys->ds_bp, osp);
        }
        mutex_exit(&ds->ds_opening_lock);
-
-       os->os = osi;
-       os->os_mode = DS_MODE_NOHOLD;
-
-       if (type != DMU_OST_ANY && type != os->os->os_phys->os_type)
-               return (EINVAL);
-       return (0);
+       return (err);
 }
 
+/* called from zpl */
 int
-dmu_objset_open_ds(dsl_dataset_t *ds, dmu_objset_type_t type, objset_t **osp)
+dmu_objset_hold(const char *name, void *tag, objset_t **osp)
 {
-       objset_t *os;
+       dsl_dataset_t *ds;
        int err;
 
-       os = kmem_alloc(sizeof (objset_t), KM_SLEEP);
-       err = dmu_objset_open_ds_os(ds, os, type);
+       err = dsl_dataset_hold(name, tag, &ds);
        if (err)
-               kmem_free(os, sizeof (objset_t));
-       else
-               *osp = os;
+               return (err);
+
+       err = dmu_objset_from_ds(ds, osp);
+       if (err)
+               dsl_dataset_rele(ds, tag);
+
        return (err);
 }
 
 /* called from zpl */
 int
-dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
-    objset_t **osp)
+dmu_objset_own(const char *name, dmu_objset_type_t type,
+    boolean_t readonly, void *tag, objset_t **osp)
 {
-       objset_t *os;
        dsl_dataset_t *ds;
        int err;
 
-       ASSERT(DS_MODE_TYPE(mode) == DS_MODE_USER ||
-           DS_MODE_TYPE(mode) == DS_MODE_OWNER);
-
-       os = kmem_alloc(sizeof (objset_t), KM_SLEEP);
-       if (DS_MODE_TYPE(mode) == DS_MODE_USER)
-               err = dsl_dataset_hold(name, os, &ds);
-       else
-               err = dsl_dataset_own(name, mode, os, &ds);
-       if (err) {
-               kmem_free(os, sizeof (objset_t));
+       err = dsl_dataset_own(name, B_FALSE, tag, &ds);
+       if (err)
                return (err);
-       }
 
-       err = dmu_objset_open_ds_os(ds, os, type);
+       err = dmu_objset_from_ds(ds, osp);
        if (err) {
-               if (DS_MODE_TYPE(mode) == DS_MODE_USER)
-                       dsl_dataset_rele(ds, os);
-               else
-                       dsl_dataset_disown(ds, os);
-               kmem_free(os, sizeof (objset_t));
-       } else {
-               os->os_mode = mode;
-               *osp = os;
+               dsl_dataset_disown(ds, tag);
+       } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
+               dmu_objset_disown(*osp, tag);
+               return (EINVAL);
+       } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
+               dmu_objset_disown(*osp, tag);
+               return (EROFS);
        }
        return (err);
 }
 
 void
-dmu_objset_close(objset_t *os)
+dmu_objset_rele(objset_t *os, void *tag)
 {
-       ASSERT(DS_MODE_TYPE(os->os_mode) == DS_MODE_USER ||
-           DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER ||
-           DS_MODE_TYPE(os->os_mode) == DS_MODE_NOHOLD);
+       dsl_dataset_rele(os->os_dsl_dataset, tag);
+}
 
-       if (DS_MODE_TYPE(os->os_mode) == DS_MODE_USER)
-               dsl_dataset_rele(os->os->os_dsl_dataset, os);
-       else if (DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER)
-               dsl_dataset_disown(os->os->os_dsl_dataset, os);
-       kmem_free(os, sizeof (objset_t));
+void
+dmu_objset_disown(objset_t *os, void *tag)
+{
+       dsl_dataset_disown(os->os_dsl_dataset, tag);
 }
 
 int
 dmu_objset_evict_dbufs(objset_t *os)
 {
-       objset_impl_t *osi = os->os;
        dnode_t *dn;
 
-       mutex_enter(&osi->os_lock);
+       mutex_enter(&os->os_lock);
 
        /* process the mdn last, since the other dnodes have holds on it */
-       list_remove(&osi->os_dnodes, osi->os_meta_dnode);
-       list_insert_tail(&osi->os_dnodes, osi->os_meta_dnode);
+       list_remove(&os->os_dnodes, os->os_meta_dnode);
+       list_insert_tail(&os->os_dnodes, os->os_meta_dnode);
 
        /*
         * Find the first dnode with holds.  We have to do this dance
@@ -429,93 +479,103 @@ dmu_objset_evict_dbufs(objset_t *os)
         * hold.  If there are no holds then it has no dbufs so OK to
         * skip.
         */
-       for (dn = list_head(&osi->os_dnodes);
+       for (dn = list_head(&os->os_dnodes);
            dn && !dnode_add_ref(dn, FTAG);
-           dn = list_next(&osi->os_dnodes, dn))
+           dn = list_next(&os->os_dnodes, dn))
                continue;
 
        while (dn) {
                dnode_t *next_dn = dn;
 
                do {
-                       next_dn = list_next(&osi->os_dnodes, next_dn);
+                       next_dn = list_next(&os->os_dnodes, next_dn);
                } while (next_dn && !dnode_add_ref(next_dn, FTAG));
 
-               mutex_exit(&osi->os_lock);
+               mutex_exit(&os->os_lock);
                dnode_evict_dbufs(dn);
                dnode_rele(dn, FTAG);
-               mutex_enter(&osi->os_lock);
+               mutex_enter(&os->os_lock);
                dn = next_dn;
        }
-       mutex_exit(&osi->os_lock);
-       return (list_head(&osi->os_dnodes) != osi->os_meta_dnode);
+       mutex_exit(&os->os_lock);
+       return (list_head(&os->os_dnodes) != os->os_meta_dnode);
 }
 
 void
-dmu_objset_evict(dsl_dataset_t *ds, void *arg)
+dmu_objset_evict(objset_t *os)
 {
-       objset_impl_t *osi = arg;
-       objset_t os;
-       int i;
+       dsl_dataset_t *ds = os->os_dsl_dataset;
 
-       for (i = 0; i < TXG_SIZE; i++) {
-               ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL);
-               ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL);
-       }
+       for (int t = 0; t < TXG_SIZE; t++)
+               ASSERT(!dmu_objset_is_dirty(os, t));
 
        if (ds) {
                if (!dsl_dataset_is_snapshot(ds)) {
                        VERIFY(0 == dsl_prop_unregister(ds, "checksum",
-                           checksum_changed_cb, osi));
+                           checksum_changed_cb, os));
                        VERIFY(0 == dsl_prop_unregister(ds, "compression",
-                           compression_changed_cb, osi));
+                           compression_changed_cb, os));
                        VERIFY(0 == dsl_prop_unregister(ds, "copies",
-                           copies_changed_cb, osi));
+                           copies_changed_cb, os));
+                       VERIFY(0 == dsl_prop_unregister(ds, "dedup",
+                           dedup_changed_cb, os));
+                       VERIFY(0 == dsl_prop_unregister(ds, "logbias",
+                           logbias_changed_cb, os));
+                       VERIFY(0 == dsl_prop_unregister(ds, "sync",
+                           sync_changed_cb, os));
                }
                VERIFY(0 == dsl_prop_unregister(ds, "primarycache",
-                   primary_cache_changed_cb, osi));
+                   primary_cache_changed_cb, os));
                VERIFY(0 == dsl_prop_unregister(ds, "secondarycache",
-                   secondary_cache_changed_cb, osi));
+                   secondary_cache_changed_cb, os));
        }
 
+       if (os->os_sa)
+               sa_tear_down(os);
+
        /*
         * We should need only a single pass over the dnode list, since
         * nothing can be added to the list at this point.
         */
-       os.os = osi;
-       (void) dmu_objset_evict_dbufs(&os);
+       (void) dmu_objset_evict_dbufs(os);
 
-       dnode_special_close(osi->os_meta_dnode);
-       if (osi->os_userused_dnode) {
-               dnode_special_close(osi->os_userused_dnode);
-               dnode_special_close(osi->os_groupused_dnode);
+       dnode_special_close(os->os_meta_dnode);
+       if (os->os_userused_dnode) {
+               dnode_special_close(os->os_userused_dnode);
+               dnode_special_close(os->os_groupused_dnode);
        }
-       zil_free(osi->os_zil);
+       zil_free(os->os_zil);
+
+       ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
 
-       ASSERT3P(list_head(&osi->os_dnodes), ==, NULL);
+       VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1);
+       mutex_destroy(&os->os_lock);
+       mutex_destroy(&os->os_obj_lock);
+       mutex_destroy(&os->os_user_ptr_lock);
+       kmem_free(os, sizeof (objset_t));
+}
 
-       VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1);
-       mutex_destroy(&osi->os_lock);
-       mutex_destroy(&osi->os_obj_lock);
-       mutex_destroy(&osi->os_user_ptr_lock);
-       kmem_free(osi, sizeof (objset_impl_t));
+timestruc_t
+dmu_objset_snap_cmtime(objset_t *os)
+{
+       return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
 }
 
 /* called from dsl for meta-objset */
-objset_impl_t *
+objset_t *
 dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
     dmu_objset_type_t type, dmu_tx_t *tx)
 {
-       objset_impl_t *osi;
+       objset_t *os;
        dnode_t *mdn;
 
        ASSERT(dmu_tx_is_syncing(tx));
        if (ds)
                mutex_enter(&ds->ds_opening_lock);
-       VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &osi));
+       VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &os));
        if (ds)
                mutex_exit(&ds->ds_opening_lock);
-       mdn = osi->os_meta_dnode;
+       mdn = os->os_meta_dnode;
 
        dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
            DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
@@ -550,24 +610,25 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
        ASSERT(type != DMU_OST_NONE);
        ASSERT(type != DMU_OST_ANY);
        ASSERT(type < DMU_OST_NUMTYPES);
-       osi->os_phys->os_type = type;
-       if (dmu_objset_userused_enabled(osi)) {
-               osi->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
-               osi->os_flags = osi->os_phys->os_flags;
+       os->os_phys->os_type = type;
+       if (dmu_objset_userused_enabled(os)) {
+               os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+               os->os_flags = os->os_phys->os_flags;
        }
 
        dsl_dataset_dirty(ds, tx);
 
-       return (osi);
+       return (os);
 }
 
 struct oscarg {
        void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
        void *userarg;
-       dsl_dataset_t *clone_parent;
+       dsl_dataset_t *clone_origin;
        const char *lastname;
        dmu_objset_type_t type;
        uint64_t flags;
+       cred_t *cr;
 };
 
 /*ARGSUSED*/
@@ -585,17 +646,13 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
        if (err != ENOENT)
                return (err ? err : EEXIST);
 
-       if (oa->clone_parent != NULL) {
-               /*
-                * You can't clone across pools.
-                */
-               if (oa->clone_parent->ds_dir->dd_pool != dd->dd_pool)
+       if (oa->clone_origin != NULL) {
+               /* You can't clone across pools. */
+               if (oa->clone_origin->ds_dir->dd_pool != dd->dd_pool)
                        return (EXDEV);
 
-               /*
-                * You can only clone snapshots, not the head datasets.
-                */
-               if (oa->clone_parent->ds_phys->ds_num_children == 0)
+               /* You can only clone snapshots, not the head datasets. */
+               if (!dsl_dataset_is_snapshot(oa->clone_origin))
                        return (EINVAL);
        }
 
@@ -603,41 +660,41 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
 }
 
 static void
-dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dir_t *dd = arg1;
        struct oscarg *oa = arg2;
-       dsl_dataset_t *ds;
-       blkptr_t *bp;
        uint64_t dsobj;
 
        ASSERT(dmu_tx_is_syncing(tx));
 
        dsobj = dsl_dataset_create_sync(dd, oa->lastname,
-           oa->clone_parent, oa->flags, cr, tx);
+           oa->clone_origin, oa->flags, oa->cr, tx);
 
-       VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dsobj, FTAG, &ds));
-       bp = dsl_dataset_get_blkptr(ds);
-       if (BP_IS_HOLE(bp)) {
-               objset_impl_t *osi;
+       if (oa->clone_origin == NULL) {
+               dsl_dataset_t *ds;
+               blkptr_t *bp;
+               objset_t *os;
 
-               /* This is an empty dmu_objset; not a clone. */
-               osi = dmu_objset_create_impl(dsl_dataset_get_spa(ds),
+               VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dsobj,
+                   FTAG, &ds));
+               bp = dsl_dataset_get_blkptr(ds);
+               ASSERT(BP_IS_HOLE(bp));
+
+               os = dmu_objset_create_impl(dsl_dataset_get_spa(ds),
                    ds, bp, oa->type, tx);
 
                if (oa->userfunc)
-                       oa->userfunc(&osi->os, oa->userarg, cr, tx);
+                       oa->userfunc(os, oa->userarg, oa->cr, tx);
+               dsl_dataset_rele(ds, FTAG);
        }
 
-       spa_history_internal_log(LOG_DS_CREATE, dd->dd_pool->dp_spa,
-           tx, cr, "dataset = %llu", dsobj);
-
-       dsl_dataset_rele(ds, FTAG);
+       spa_history_log_internal(LOG_DS_CREATE, dd->dd_pool->dp_spa,
+           tx, "dataset = %llu", dsobj);
 }
 
 int
-dmu_objset_create(const char *name, dmu_objset_type_t type,
-    objset_t *clone_parent, uint64_t flags,
+dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
     void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
 {
        dsl_dir_t *pdd;
@@ -654,24 +711,13 @@ dmu_objset_create(const char *name, dmu_objset_type_t type,
                return (EEXIST);
        }
 
-       dprintf("name=%s\n", name);
-
        oa.userfunc = func;
        oa.userarg = arg;
        oa.lastname = tail;
        oa.type = type;
        oa.flags = flags;
+       oa.cr = CRED();
 
-       if (clone_parent != NULL) {
-               /*
-                * You can't clone to a different type.
-                */
-               if (clone_parent->os->os_phys->os_type != type) {
-                       dsl_dir_close(pdd, FTAG);
-                       return (EINVAL);
-               }
-               oa.clone_parent = clone_parent->os->os_dsl_dataset;
-       }
        err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
            dmu_objset_create_sync, pdd, &oa, 5);
        dsl_dir_close(pdd, FTAG);
@@ -679,66 +725,63 @@ dmu_objset_create(const char *name, dmu_objset_type_t type,
 }
 
 int
-dmu_objset_destroy(const char *name, boolean_t defer)
+dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags)
 {
-       objset_t *os;
-       int error;
-
-       /*
-        * If it looks like we'll be able to destroy it, and there's
-        * an unplayed replay log sitting around, destroy the log.
-        * It would be nicer to do this in dsl_dataset_destroy_sync(),
-        * but the replay log objset is modified in open context.
-        */
-       error = dmu_objset_open(name, DMU_OST_ANY,
-           DS_MODE_OWNER|DS_MODE_READONLY|DS_MODE_INCONSISTENT, &os);
-       if (error == 0) {
-               dsl_dataset_t *ds = os->os->os_dsl_dataset;
-               zil_destroy(dmu_objset_zil(os), B_FALSE);
+       dsl_dir_t *pdd;
+       const char *tail;
+       int err = 0;
+       struct oscarg oa = { 0 };
 
-               error = dsl_dataset_destroy(ds, os, defer);
-               /*
-                * dsl_dataset_destroy() closes the ds.
-                */
-               kmem_free(os, sizeof (objset_t));
+       ASSERT(strchr(name, '@') == NULL);
+       err = dsl_dir_open(name, FTAG, &pdd, &tail);
+       if (err)
+               return (err);
+       if (tail == NULL) {
+               dsl_dir_close(pdd, FTAG);
+               return (EEXIST);
        }
 
-       return (error);
+       oa.lastname = tail;
+       oa.clone_origin = clone_origin;
+       oa.flags = flags;
+       oa.cr = CRED();
+
+       err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
+           dmu_objset_create_sync, pdd, &oa, 5);
+       dsl_dir_close(pdd, FTAG);
+       return (err);
 }
 
-/*
- * This will close the objset.
- */
 int
-dmu_objset_rollback(objset_t *os)
+dmu_objset_destroy(const char *name, boolean_t defer)
 {
-       int err;
        dsl_dataset_t *ds;
-
-       ds = os->os->os_dsl_dataset;
-
-       if (!dsl_dataset_tryown(ds, TRUE, os)) {
-               dmu_objset_close(os);
-               return (EBUSY);
-       }
-
-       err = dsl_dataset_rollback(ds, os->os->os_phys->os_type);
+       int error;
 
        /*
-        * NB: we close the objset manually because the rollback
-        * actually implicitly called dmu_objset_evict(), thus freeing
-        * the objset_impl_t.
+        * dsl_dataset_destroy() can free any claimed-but-unplayed
+        * intent log, but if there is an active log, it has blocks that
+        * are allocated, but may not yet be reflected in the on-disk
+        * structure.  Only the ZIL knows how to free them, so we have
+        * to call into it here.
         */
-       dsl_dataset_disown(ds, os);
-       kmem_free(os, sizeof (objset_t));
-       return (err);
+       error = dsl_dataset_own(name, B_TRUE, FTAG, &ds);
+       if (error == 0) {
+               objset_t *os;
+               if (dmu_objset_from_ds(ds, &os) == 0)
+                       zil_destroy(dmu_objset_zil(os), B_FALSE);
+               error = dsl_dataset_destroy(ds, FTAG, defer);
+               /* dsl_dataset_destroy() closes the ds. */
+       }
+
+       return (error);
 }
 
 struct snaparg {
        dsl_sync_task_group_t *dstg;
        char *snapname;
        char failed[MAXPATHLEN];
-       boolean_t checkperms;
+       boolean_t recursive;
        nvlist_t *props;
 };
 
@@ -750,49 +793,68 @@ snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
 
        /* The props have already been checked by zfs_check_userprops(). */
 
-       return (dsl_dataset_snapshot_check(os->os->os_dsl_dataset,
+       return (dsl_dataset_snapshot_check(os->os_dsl_dataset,
            sn->snapname, tx));
 }
 
 static void
-snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        objset_t *os = arg1;
-       dsl_dataset_t *ds = os->os->os_dsl_dataset;
+       dsl_dataset_t *ds = os->os_dsl_dataset;
        struct snaparg *sn = arg2;
 
-       dsl_dataset_snapshot_sync(ds, sn->snapname, cr, tx);
+       dsl_dataset_snapshot_sync(ds, sn->snapname, tx);
 
-       if (sn->props)
-               dsl_props_set_sync(ds->ds_prev, sn->props, cr, tx);
+       if (sn->props) {
+               dsl_props_arg_t pa;
+               pa.pa_props = sn->props;
+               pa.pa_source = ZPROP_SRC_LOCAL;
+               dsl_props_set_sync(ds->ds_prev, &pa, tx);
+       }
 }
 
 static int
-dmu_objset_snapshot_one(char *name, void *arg)
+dmu_objset_snapshot_one(const char *name, void *arg)
 {
        struct snaparg *sn = arg;
        objset_t *os;
        int err;
+       char *cp;
+
+       /*
+        * If the objset starts with a '%', then ignore it unless it was
+        * explicitly named (ie, not recursive).  These hidden datasets
+        * are always inconsistent, and by not opening them here, we can
+        * avoid a race with dsl_dir_destroy_check().
+        */
+       cp = strrchr(name, '/');
+       if (cp && cp[1] == '%' && sn->recursive)
+               return (0);
 
        (void) strcpy(sn->failed, name);
 
        /*
-        * Check permissions only when requested.  This only applies when
-        * doing a recursive snapshot.  The permission checks for the starting
-        * dataset have already been performed in zfs_secpolicy_snapshot()
+        * Check permissions if we are doing a recursive snapshot.  The
+        * permission checks for the starting dataset have already been
+        * performed in zfs_secpolicy_snapshot()
         */
-       if (sn->checkperms == B_TRUE &&
-           (err = zfs_secpolicy_snapshot_perms(name, CRED())))
+       if (sn->recursive && (err = zfs_secpolicy_snapshot_perms(name, CRED())))
                return (err);
 
-       err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_USER, &os);
+       err = dmu_objset_hold(name, sn, &os);
        if (err != 0)
                return (err);
 
-       /* If the objset is in an inconsistent state, return busy */
-       if (os->os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) {
-               dmu_objset_close(os);
-               return (EBUSY);
+       /*
+        * If the objset is in an inconsistent state (eg, in the process
+        * of being destroyed), don't snapshot it.  As with %hidden
+        * datasets, we return EBUSY if this name was explicitly
+        * requested (ie, not recursive), and otherwise ignore it.
+        */
+       if (os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) {
+               dmu_objset_rele(os, sn);
+               return (sn->recursive ? 0 : EBUSY);
        }
 
        /*
@@ -805,7 +867,7 @@ dmu_objset_snapshot_one(char *name, void *arg)
                dsl_sync_task_create(sn->dstg, snapshot_check,
                    snapshot_sync, os, sn, 3);
        } else {
-               dmu_objset_close(os);
+               dmu_objset_rele(os, sn);
        }
 
        return (err);
@@ -829,13 +891,12 @@ dmu_objset_snapshot(char *fsname, char *snapname,
        sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
        sn.snapname = snapname;
        sn.props = props;
+       sn.recursive = recursive;
 
        if (recursive) {
-               sn.checkperms = B_TRUE;
                err = dmu_objset_find(fsname,
                    dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN);
        } else {
-               sn.checkperms = B_FALSE;
                err = dmu_objset_snapshot_one(fsname, &sn);
        }
 
@@ -845,11 +906,11 @@ dmu_objset_snapshot(char *fsname, char *snapname,
        for (dst = list_head(&sn.dstg->dstg_tasks); dst;
            dst = list_next(&sn.dstg->dstg_tasks, dst)) {
                objset_t *os = dst->dst_arg1;
-               dsl_dataset_t *ds = os->os->os_dsl_dataset;
+               dsl_dataset_t *ds = os->os_dsl_dataset;
                if (dst->dst_err)
                        dsl_dataset_name(ds, sn.failed);
                zil_resume(dmu_objset_zil(os));
-               dmu_objset_close(os);
+               dmu_objset_rele(os, &sn);
        }
 
        if (err)
@@ -888,11 +949,10 @@ dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx)
 
 /* ARGSUSED */
 static void
-ready(zio_t *zio, arc_buf_t *abuf, void *arg)
+dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
 {
        blkptr_t *bp = zio->io_bp;
-       blkptr_t *bp_orig = &zio->io_bp_orig;
-       objset_impl_t *os = arg;
+       objset_t *os = arg;
        dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
 
        ASSERT(bp == os->os_rootbp);
@@ -908,24 +968,34 @@ ready(zio_t *zio, arc_buf_t *abuf, void *arg)
        bp->blk_fill = 0;
        for (int i = 0; i < dnp->dn_nblkptr; i++)
                bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
+}
+
+/* ARGSUSED */
+static void
+dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
+{
+       blkptr_t *bp = zio->io_bp;
+       blkptr_t *bp_orig = &zio->io_bp_orig;
+       objset_t *os = arg;
 
        if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
-               ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
+               ASSERT(BP_EQUAL(bp, bp_orig));
        } else {
-               if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg)
-                       (void) dsl_dataset_block_kill(os->os_dsl_dataset,
-                           &zio->io_bp_orig, zio, os->os_synctx);
-               dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx);
+               dsl_dataset_t *ds = os->os_dsl_dataset;
+               dmu_tx_t *tx = os->os_synctx;
+
+               (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
+               dsl_dataset_block_born(ds, bp, tx);
        }
 }
 
 /* called from dsl */
 void
-dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
+dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
 {
        int txgoff;
        zbookmark_t zb;
-       writeprops_t wp = { 0 };
+       zio_prop_t zp;
        zio_t *zio;
        list_t *list;
        list_t *newlist = NULL;
@@ -949,26 +1019,17 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
        /*
         * Create the root block IO
         */
-       zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
-       zb.zb_object = 0;
-       zb.zb_level = -1;       /* for block ordering; it's level 0 on disk */
-       zb.zb_blkid = 0;
-
-       wp.wp_type = DMU_OT_OBJSET;
-       wp.wp_level = 0;        /* on-disk BP level; see above */
-       wp.wp_copies = os->os_copies;
-       wp.wp_oschecksum = os->os_checksum;
-       wp.wp_oscompress = os->os_compress;
-
-       if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) {
-               (void) dsl_dataset_block_kill(os->os_dsl_dataset,
-                   os->os_rootbp, pio, tx);
-       }
+       SET_BOOKMARK(&zb, os->os_dsl_dataset ?
+           os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+           ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+       VERIFY3U(0, ==, arc_release_bp(os->os_phys_buf, &os->os_phys_buf,
+           os->os_rootbp, os->os_spa, &zb));
 
-       arc_release(os->os_phys_buf, &os->os_phys_buf);
+       dmu_write_policy(os, NULL, 0, 0, &zp);
 
-       zio = arc_write(pio, os->os_spa, &wp, DMU_OS_IS_L2CACHEABLE(os),
-           tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, NULL, os,
+       zio = arc_write(pio, os->os_spa, tx->tx_txg,
+           os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), &zp,
+           dmu_objset_write_ready, dmu_objset_write_done, os,
            ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 
        /*
@@ -1017,7 +1078,14 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
        zio_nowait(zio);
 }
 
-static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
+boolean_t
+dmu_objset_is_dirty(objset_t *os, uint64_t txg)
+{
+       return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) ||
+           !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK]));
+}
+
+objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
 
 void
 dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
@@ -1026,74 +1094,84 @@ dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
 }
 
 boolean_t
-dmu_objset_userused_enabled(objset_impl_t *os)
+dmu_objset_userused_enabled(objset_t *os)
 {
        return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
            used_cbs[os->os_phys->os_type] &&
            os->os_userused_dnode);
 }
 
+static void
+do_userquota_update(objset_t *os, uint64_t used, uint64_t flags,
+    uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx)
+{
+       if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
+               int64_t delta = DNODE_SIZE + used;
+               if (subtract)
+                       delta = -delta;
+               VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
+                   user, delta, tx));
+               VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT,
+                   group, delta, tx));
+       }
+}
+
 void
-dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx)
+dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
 {
        dnode_t *dn;
        list_t *list = &os->os_synced_dnodes;
-       static const char zerobuf[DN_MAX_BONUSLEN] = {0};
 
        ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));
 
        while (dn = list_head(list)) {
-               dmu_object_type_t bonustype;
-
                ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
-               ASSERT(dn->dn_oldphys);
                ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
                    dn->dn_phys->dn_flags &
                    DNODE_FLAG_USERUSED_ACCOUNTED);
 
                /* Allocate the user/groupused objects if necessary. */
                if (os->os_userused_dnode->dn_type == DMU_OT_NONE) {
-                       VERIFY(0 == zap_create_claim(&os->os,
+                       VERIFY(0 == zap_create_claim(os,
                            DMU_USERUSED_OBJECT,
                            DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
-                       VERIFY(0 == zap_create_claim(&os->os,
+                       VERIFY(0 == zap_create_claim(os,
                            DMU_GROUPUSED_OBJECT,
                            DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
                }
 
                /*
-                * If the object was not previously
-                * accounted, pretend that it was free.
+                * We intentionally modify the zap object even if the
+                * net delta is zero.  Otherwise
+                * the block of the zap obj could be shared between
+                * datasets but need to be different between them after
+                * a bprewrite.
                 */
-               if (!(dn->dn_oldphys->dn_flags &
-                   DNODE_FLAG_USERUSED_ACCOUNTED)) {
-                       bzero(dn->dn_oldphys, sizeof (dnode_phys_t));
-               }
-
-               /*
-                * If the object was freed, use the previous bonustype.
-                */
-               bonustype = dn->dn_phys->dn_bonustype ?
-                   dn->dn_phys->dn_bonustype : dn->dn_oldphys->dn_bonustype;
-               ASSERT(dn->dn_phys->dn_type != 0 ||
-                   (bcmp(DN_BONUS(dn->dn_phys), zerobuf,
-                   DN_MAX_BONUSLEN) == 0 &&
-                   DN_USED_BYTES(dn->dn_phys) == 0));
-               ASSERT(dn->dn_oldphys->dn_type != 0 ||
-                   (bcmp(DN_BONUS(dn->dn_oldphys), zerobuf,
-                   DN_MAX_BONUSLEN) == 0 &&
-                   DN_USED_BYTES(dn->dn_oldphys) == 0));
-               used_cbs[os->os_phys->os_type](&os->os, bonustype,
-                   DN_BONUS(dn->dn_oldphys), DN_BONUS(dn->dn_phys),
-                   DN_USED_BYTES(dn->dn_oldphys),
-                   DN_USED_BYTES(dn->dn_phys), tx);
 
-               /*
-                * The mutex is needed here for interlock with dnode_allocate.
-                */
                mutex_enter(&dn->dn_mtx);
-               zio_buf_free(dn->dn_oldphys, sizeof (dnode_phys_t));
-               dn->dn_oldphys = NULL;
+               ASSERT(dn->dn_id_flags);
+               if (dn->dn_id_flags & DN_ID_OLD_EXIST)  {
+                       do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags,
+                           dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
+               }
+               if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
+                       do_userquota_update(os, DN_USED_BYTES(dn->dn_phys),
+                           dn->dn_phys->dn_flags,  dn->dn_newuid,
+                           dn->dn_newgid, B_FALSE, tx);
+               }
+
+               dn->dn_oldused = 0;
+               dn->dn_oldflags = 0;
+               if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
+                       dn->dn_olduid = dn->dn_newuid;
+                       dn->dn_oldgid = dn->dn_newgid;
+                       dn->dn_id_flags |= DN_ID_OLD_EXIST;
+                       if (dn->dn_bonuslen == 0)
+                               dn->dn_id_flags |= DN_ID_CHKED_SPILL;
+                       else
+                               dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+               }
+               dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
                mutex_exit(&dn->dn_mtx);
 
                list_remove(list, dn);
@@ -1101,10 +1179,140 @@ dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx)
        }
 }
 
+/*
+ * Returns a pointer to data to find uid/gid from
+ *
+ * If a dirty record for transaction group that is syncing can't
+ * be found then NULL is returned.  In the NULL case it is assumed
+ * the uid/gid aren't changing.
+ */
+static void *
+dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+       dbuf_dirty_record_t *dr, **drp;
+       void *data;
+
+       if (db->db_dirtycnt == 0)
+               return (db->db.db_data);  /* Nothing is changing */
+
+       for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
+               if (dr->dr_txg == tx->tx_txg)
+                       break;
+
+       if (dr == NULL)
+               data = NULL;
+       else if (dr->dr_dbuf->db_dnode->dn_bonuslen == 0 &&
+           dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
+               data = dr->dt.dl.dr_data->b_data;
+       else
+               data = dr->dt.dl.dr_data;
+       return (data);
+}
+
+void
+dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
+{
+       objset_t *os = dn->dn_objset;
+       void *data = NULL;
+       dmu_buf_impl_t *db = NULL;
+       uint64_t *user, *group;
+       int flags = dn->dn_id_flags;
+       int error;
+       boolean_t have_spill = B_FALSE;
+
+       if (!dmu_objset_userused_enabled(dn->dn_objset))
+               return;
+
+       if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
+           DN_ID_CHKED_SPILL)))
+               return;
+
+       if (before && dn->dn_bonuslen != 0)
+               data = DN_BONUS(dn->dn_phys);
+       else if (!before && dn->dn_bonuslen != 0) {
+               if (dn->dn_bonus) {
+                       db = dn->dn_bonus;
+                       mutex_enter(&db->db_mtx);
+                       data = dmu_objset_userquota_find_data(db, tx);
+               } else {
+                       data = DN_BONUS(dn->dn_phys);
+               }
+       } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
+                       int rf = 0;
+
+                       if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
+                               rf |= DB_RF_HAVESTRUCT;
+                       error = dmu_spill_hold_by_dnode(dn, rf,
+                           FTAG, (dmu_buf_t **)&db);
+                       ASSERT(error == 0);
+                       mutex_enter(&db->db_mtx);
+                       data = (before) ? db->db.db_data :
+                           dmu_objset_userquota_find_data(db, tx);
+                       have_spill = B_TRUE;
+       } else {
+               mutex_enter(&dn->dn_mtx);
+               dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+               mutex_exit(&dn->dn_mtx);
+               return;
+       }
+
+       if (before) {
+               ASSERT(data);
+               user = &dn->dn_olduid;
+               group = &dn->dn_oldgid;
+       } else if (data) {
+               user = &dn->dn_newuid;
+               group = &dn->dn_newgid;
+       }
+
+       /*
+        * Must always call the callback in case the object
+        * type has changed and that type isn't an object type to track
+        */
+       error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data,
+           user, group);
+
+       /*
+        * Preserve existing uid/gid when the callback can't determine
+        * what the new uid/gid are and the callback returned EEXIST.
+        * The EEXIST error tells us to just use the existing uid/gid.
+        * If we don't know what the old values are then just assign
+        * them to 0, since that is a new file  being created.
+        */
+       if (!before && data == NULL && error == EEXIST) {
+               if (flags & DN_ID_OLD_EXIST) {
+                       dn->dn_newuid = dn->dn_olduid;
+                       dn->dn_newgid = dn->dn_oldgid;
+               } else {
+                       dn->dn_newuid = 0;
+                       dn->dn_newgid = 0;
+               }
+               error = 0;
+       }
+
+       if (db)
+               mutex_exit(&db->db_mtx);
+
+       mutex_enter(&dn->dn_mtx);
+       if (error == 0 && before)
+               dn->dn_id_flags |= DN_ID_OLD_EXIST;
+       if (error == 0 && !before)
+               dn->dn_id_flags |= DN_ID_NEW_EXIST;
+
+       if (have_spill) {
+               dn->dn_id_flags |= DN_ID_CHKED_SPILL;
+       } else {
+               dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+       }
+       mutex_exit(&dn->dn_mtx);
+       if (have_spill)
+               dmu_buf_rele((dmu_buf_t *)db, FTAG);
+}
+
 boolean_t
 dmu_objset_userspace_present(objset_t *os)
 {
-       return (os->os->os_phys->os_flags &
+       return (os->os_phys->os_flags &
            OBJSET_FLAG_USERACCOUNTING_COMPLETE);
 }
 
@@ -1116,7 +1324,7 @@ dmu_objset_userspace_upgrade(objset_t *os)
 
        if (dmu_objset_userspace_present(os))
                return (0);
-       if (!dmu_objset_userused_enabled(os->os))
+       if (!dmu_objset_userused_enabled(os))
                return (ENOTSUP);
        if (dmu_objset_is_snapshot(os))
                return (EINVAL);
@@ -1152,7 +1360,7 @@ dmu_objset_userspace_upgrade(objset_t *os)
                dmu_tx_commit(tx);
        }
 
-       os->os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+       os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
        txg_wait_synced(dmu_objset_pool(os), 0);
        return (0);
 }
@@ -1161,35 +1369,35 @@ void
 dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp)
 {
-       dsl_dataset_space(os->os->os_dsl_dataset, refdbytesp, availbytesp,
+       dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
            usedobjsp, availobjsp);
 }
 
 uint64_t
 dmu_objset_fsid_guid(objset_t *os)
 {
-       return (dsl_dataset_fsid_guid(os->os->os_dsl_dataset));
+       return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
 }
 
 void
 dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
 {
-       stat->dds_type = os->os->os_phys->os_type;
-       if (os->os->os_dsl_dataset)
-               dsl_dataset_fast_stat(os->os->os_dsl_dataset, stat);
+       stat->dds_type = os->os_phys->os_type;
+       if (os->os_dsl_dataset)
+               dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
 }
 
 void
 dmu_objset_stats(objset_t *os, nvlist_t *nv)
 {
-       ASSERT(os->os->os_dsl_dataset ||
-           os->os->os_phys->os_type == DMU_OST_META);
+       ASSERT(os->os_dsl_dataset ||
+           os->os_phys->os_type == DMU_OST_META);
 
-       if (os->os->os_dsl_dataset != NULL)
-               dsl_dataset_stats(os->os->os_dsl_dataset, nv);
+       if (os->os_dsl_dataset != NULL)
+               dsl_dataset_stats(os->os_dsl_dataset, nv);
 
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
-           os->os->os_phys->os_type);
+           os->os_phys->os_type);
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
            dmu_objset_userspace_present(os));
 }
@@ -1197,8 +1405,8 @@ dmu_objset_stats(objset_t *os, nvlist_t *nv)
 int
 dmu_objset_is_snapshot(objset_t *os)
 {
-       if (os->os->os_dsl_dataset != NULL)
-               return (dsl_dataset_is_snapshot(os->os->os_dsl_dataset));
+       if (os->os_dsl_dataset != NULL)
+               return (dsl_dataset_is_snapshot(os->os_dsl_dataset));
        else
                return (B_FALSE);
 }
@@ -1207,7 +1415,7 @@ int
 dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
     boolean_t *conflict)
 {
-       dsl_dataset_t *ds = os->os->os_dsl_dataset;
+       dsl_dataset_t *ds = os->os_dsl_dataset;
        uint64_t ignored;
 
        if (ds->ds_phys->ds_snapnames_zapobj == 0)
@@ -1222,7 +1430,7 @@ int
 dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
 {
-       dsl_dataset_t *ds = os->os->os_dsl_dataset;
+       dsl_dataset_t *ds = os->os_dsl_dataset;
        zap_cursor_t cursor;
        zap_attribute_t attr;
 
@@ -1259,12 +1467,12 @@ int
 dmu_dir_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp)
 {
-       dsl_dir_t *dd = os->os->os_dsl_dataset->ds_dir;
+       dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
        zap_cursor_t cursor;
        zap_attribute_t attr;
 
        /* there is no next dir on a snapshot! */
-       if (os->os->os_dsl_dataset->ds_object !=
+       if (os->os_dsl_dataset->ds_object !=
            dd->dd_phys->dd_head_dataset_obj)
                return (ENOENT);
 
@@ -1293,7 +1501,7 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name,
 }
 
 struct findarg {
-       int (*func)(char *, void *);
+       int (*func)(const char *, void *);
        void *arg;
 };
 
@@ -1302,7 +1510,7 @@ static int
 findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
 {
        struct findarg *fa = arg;
-       return (fa->func((char *)dsname, fa->arg));
+       return (fa->func(dsname, fa->arg));
 }
 
 /*
@@ -1310,7 +1518,8 @@ findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
  * Perhaps change all callers to use dmu_objset_find_spa()?
  */
 int
-dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags)
+dmu_objset_find(char *name, int func(const char *, void *), void *arg,
+    int flags)
 {
        struct findarg fa;
        fa.func = func;
@@ -1361,12 +1570,9 @@ dmu_objset_find_spa(spa_t *spa, const char *name,
                        ASSERT(attr->za_integer_length == sizeof (uint64_t));
                        ASSERT(attr->za_num_integers == 1);
 
-                       child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
-                       (void) strcpy(child, name);
-                       (void) strcat(child, "/");
-                       (void) strcat(child, attr->za_name);
+                       child = kmem_asprintf("%s/%s", name, attr->za_name);
                        err = dmu_objset_find_spa(spa, child, func, arg, flags);
-                       kmem_free(child, MAXPATHLEN);
+                       strfree(child);
                        if (err)
                                break;
                }
@@ -1400,13 +1606,11 @@ dmu_objset_find_spa(spa_t *spa, const char *name,
                                    sizeof (uint64_t));
                                ASSERT(attr->za_num_integers == 1);
 
-                               child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
-                               (void) strcpy(child, name);
-                               (void) strcat(child, "@");
-                               (void) strcat(child, attr->za_name);
+                               child = kmem_asprintf("%s@%s",
+                                   name, attr->za_name);
                                err = func(spa, attr->za_first_integer,
                                    child, arg);
-                               kmem_free(child, MAXPATHLEN);
+                               strfree(child);
                                if (err)
                                        break;
                        }
@@ -1429,7 +1633,7 @@ dmu_objset_find_spa(spa_t *spa, const char *name,
 
 /* ARGSUSED */
 int
-dmu_objset_prefetch(char *name, void *arg)
+dmu_objset_prefetch(const char *name, void *arg)
 {
        dsl_dataset_t *ds;
 
@@ -1438,16 +1642,14 @@ dmu_objset_prefetch(char *name, void *arg)
 
        if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) {
                mutex_enter(&ds->ds_opening_lock);
-               if (!dsl_dataset_get_user_ptr(ds)) {
+               if (ds->ds_objset == NULL) {
                        uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
                        zbookmark_t zb;
 
-                       zb.zb_objset = ds->ds_object;
-                       zb.zb_object = 0;
-                       zb.zb_level = -1;
-                       zb.zb_blkid = 0;
+                       SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT,
+                           ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 
-                       (void) arc_read_nolock(NULL, dsl_dataset_get_spa(ds),
+                       (void) dsl_read_nolock(NULL, dsl_dataset_get_spa(ds),
                            &ds->ds_phys->ds_bp, NULL, NULL,
                            ZIO_PRIORITY_ASYNC_READ,
                            ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
@@ -1463,13 +1665,13 @@ dmu_objset_prefetch(char *name, void *arg)
 void
 dmu_objset_set_user(objset_t *os, void *user_ptr)
 {
-       ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock));
-       os->os->os_user_ptr = user_ptr;
+       ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
+       os->os_user_ptr = user_ptr;
 }
 
 void *
 dmu_objset_get_user(objset_t *os)
 {
-       ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock));
-       return (os->os->os_user_ptr);
+       ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
+       return (os->os_user_ptr);
 }
index ce59aac..6b00b73 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/dmu.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
+#include <zfs_fletcher.h>
+#include <sys/avl.h>
+#include <sys/ddt.h>
 
 static char *dmu_recv_tag = "dmu_recv_tag";
 
+/*
+ * The list of data whose inclusion in a send stream can be pending from
+ * one call to backup_cb to another.  Multiple calls to dump_free() and
+ * dump_freeobjects() can be aggregated into a single DRR_FREE or
+ * DRR_FREEOBJECTS replay record.
+ */
+typedef enum {
+       PENDING_NONE,
+       PENDING_FREE,
+       PENDING_FREEOBJECTS
+} pendop_t;
+
 struct backuparg {
        dmu_replay_record_t *drr;
        vnode_t *vp;
        offset_t *off;
        objset_t *os;
        zio_cksum_t zc;
+       uint64_t toguid;
        int err;
+       pendop_t pending_op;
 };
 
 static int
@@ -68,29 +86,120 @@ static int
 dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
     uint64_t length)
 {
-       /* write a FREE record */
+       struct drr_free *drrf = &(ba->drr->drr_u.drr_free);
+
+       /*
+        * If there is a pending op, but it's not PENDING_FREE, push it out,
+        * since free block aggregation can only be done for blocks of the
+        * same type (i.e., DRR_FREE records can only be aggregated with
+        * other DRR_FREE records.  DRR_FREEOBJECTS records can only be
+        * aggregated with other DRR_FREEOBJECTS records.
+        */
+       if (ba->pending_op != PENDING_NONE && ba->pending_op != PENDING_FREE) {
+               if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
+                       return (EINTR);
+               ba->pending_op = PENDING_NONE;
+       }
+
+       if (ba->pending_op == PENDING_FREE) {
+               /*
+                * There should never be a PENDING_FREE if length is -1
+                * (because dump_dnode is the only place where this
+                * function is called with a -1, and only after flushing
+                * any pending record).
+                */
+               ASSERT(length != -1ULL);
+               /*
+                * Check to see whether this free block can be aggregated
+        * with the pending one.
+                */
+               if (drrf->drr_object == object && drrf->drr_offset +
+                   drrf->drr_length == offset) {
+                       drrf->drr_length += length;
+                       return (0);
+               } else {
+                       /* not a continuation.  Push out pending record */
+                       if (dump_bytes(ba, ba->drr,
+                           sizeof (dmu_replay_record_t)) != 0)
+                               return (EINTR);
+                       ba->pending_op = PENDING_NONE;
+               }
+       }
+       /* create a FREE record and make it pending */
        bzero(ba->drr, sizeof (dmu_replay_record_t));
        ba->drr->drr_type = DRR_FREE;
-       ba->drr->drr_u.drr_free.drr_object = object;
-       ba->drr->drr_u.drr_free.drr_offset = offset;
-       ba->drr->drr_u.drr_free.drr_length = length;
+       drrf->drr_object = object;
+       drrf->drr_offset = offset;
+       drrf->drr_length = length;
+       drrf->drr_toguid = ba->toguid;
+       if (length == -1ULL) {
+               if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
+                       return (EINTR);
+       } else {
+               ba->pending_op = PENDING_FREE;
+       }
 
-       if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
-               return (EINTR);
        return (0);
 }
 
 static int
 dump_data(struct backuparg *ba, dmu_object_type_t type,
-    uint64_t object, uint64_t offset, int blksz, void *data)
+    uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
 {
+       struct drr_write *drrw = &(ba->drr->drr_u.drr_write);
+
+
+       /*
+        * If there is any kind of pending aggregation (currently either
+        * a grouping of free objects or free blocks), push it out to
+        * the stream, since aggregation can't be done across operations
+        * of different types.
+        */
+       if (ba->pending_op != PENDING_NONE) {
+               if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
+                       return (EINTR);
+               ba->pending_op = PENDING_NONE;
+       }
        /* write a DATA record */
        bzero(ba->drr, sizeof (dmu_replay_record_t));
        ba->drr->drr_type = DRR_WRITE;
-       ba->drr->drr_u.drr_write.drr_object = object;
-       ba->drr->drr_u.drr_write.drr_type = type;
-       ba->drr->drr_u.drr_write.drr_offset = offset;
-       ba->drr->drr_u.drr_write.drr_length = blksz;
+       drrw->drr_object = object;
+       drrw->drr_type = type;
+       drrw->drr_offset = offset;
+       drrw->drr_length = blksz;
+       drrw->drr_toguid = ba->toguid;
+       drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
+       if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
+               drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
+       DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
+       DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
+       DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
+       drrw->drr_key.ddk_cksum = bp->blk_cksum;
+
+       if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
+               return (EINTR);
+       if (dump_bytes(ba, data, blksz) != 0)
+               return (EINTR);
+       return (0);
+}
+
+static int
+dump_spill(struct backuparg *ba, uint64_t object, int blksz, void *data)
+{
+       struct drr_spill *drrs = &(ba->drr->drr_u.drr_spill);
+
+       if (ba->pending_op != PENDING_NONE) {
+               if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
+                       return (EINTR);
+               ba->pending_op = PENDING_NONE;
+       }
+
+       /* write a SPILL record */
+       bzero(ba->drr, sizeof (dmu_replay_record_t));
+       ba->drr->drr_type = DRR_SPILL;
+       drrs->drr_object = object;
+       drrs->drr_length = blksz;
+       drrs->drr_toguid = ba->toguid;
 
        if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
                return (EINTR);
@@ -102,39 +211,80 @@ dump_data(struct backuparg *ba, dmu_object_type_t type,
 static int
 dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
 {
+       struct drr_freeobjects *drrfo = &(ba->drr->drr_u.drr_freeobjects);
+
+       /*
+        * If there is a pending op, but it's not PENDING_FREEOBJECTS,
+        * push it out, since free block aggregation can only be done for
+        * blocks of the same type (i.e., DRR_FREE records can only be
+        * aggregated with other DRR_FREE records.  DRR_FREEOBJECTS records
+        * can only be aggregated with other DRR_FREEOBJECTS records.)
+        */
+       if (ba->pending_op != PENDING_NONE &&
+           ba->pending_op != PENDING_FREEOBJECTS) {
+               if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
+                       return (EINTR);
+               ba->pending_op = PENDING_NONE;
+       }
+       if (ba->pending_op == PENDING_FREEOBJECTS) {
+               /*
+                * See whether this free object array can be aggregated
+                * with the pending one
+                */
+               if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
+                       drrfo->drr_numobjs += numobjs;
+                       return (0);
+               } else {
+                       /* can't be aggregated.  Push out pending record */
+                       if (dump_bytes(ba, ba->drr,
+                           sizeof (dmu_replay_record_t)) != 0)
+                               return (EINTR);
+                       ba->pending_op = PENDING_NONE;
+               }
+       }
+
        /* write a FREEOBJECTS record */
        bzero(ba->drr, sizeof (dmu_replay_record_t));
        ba->drr->drr_type = DRR_FREEOBJECTS;
-       ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj;
-       ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs;
+       drrfo->drr_firstobj = firstobj;
+       drrfo->drr_numobjs = numobjs;
+       drrfo->drr_toguid = ba->toguid;
+
+       ba->pending_op = PENDING_FREEOBJECTS;
 
-       if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
-               return (EINTR);
        return (0);
 }
 
 static int
 dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
 {
+       struct drr_object *drro = &(ba->drr->drr_u.drr_object);
+
        if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
                return (dump_freeobjects(ba, object, 1));
 
+       if (ba->pending_op != PENDING_NONE) {
+               if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
+                       return (EINTR);
+               ba->pending_op = PENDING_NONE;
+       }
+
        /* write an OBJECT record */
        bzero(ba->drr, sizeof (dmu_replay_record_t));
        ba->drr->drr_type = DRR_OBJECT;
-       ba->drr->drr_u.drr_object.drr_object = object;
-       ba->drr->drr_u.drr_object.drr_type = dnp->dn_type;
-       ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype;
-       ba->drr->drr_u.drr_object.drr_blksz =
-           dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
-       ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen;
-       ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum;
-       ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress;
-
-       if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
+       drro->drr_object = object;
+       drro->drr_type = dnp->dn_type;
+       drro->drr_bonustype = dnp->dn_bonustype;
+       drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+       drro->drr_bonuslen = dnp->dn_bonuslen;
+       drro->drr_checksumtype = dnp->dn_checksum;
+       drro->drr_compress = dnp->dn_compress;
+       drro->drr_toguid = ba->toguid;
+
+       if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
                return (EINTR);
 
-       if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)))
+       if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
                return (EINTR);
 
        /* free anything past the end of the file */
@@ -150,9 +300,10 @@ dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
        (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
        (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
 
+/* ARGSUSED */
 static int
-backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
-    const dnode_phys_t *dnp, void *arg)
+backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
+    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
 {
        struct backuparg *ba = arg;
        dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
@@ -161,9 +312,10 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
        if (issig(JUSTLOOKING) && issig(FORREAL))
                return (EINTR);
 
-       if (zb->zb_object != 0 && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
+       if (zb->zb_object != DMU_META_DNODE_OBJECT &&
+           DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
                return (0);
-       } else if (bp == NULL && zb->zb_object == 0) {
+       } else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) {
                uint64_t span = BP_SPAN(dnp, zb->zb_level);
                uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
                err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
@@ -179,7 +331,7 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
                uint32_t aflags = ARC_WAIT;
                arc_buf_t *abuf;
 
-               if (arc_read_nolock(NULL, spa, bp,
+               if (dsl_read(NULL, spa, bp, pbuf,
                    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
                    ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
                        return (EIO);
@@ -193,7 +345,7 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
                                break;
                }
                (void) arc_buf_remove_ref(abuf, &abuf);
-       } else { /* it's a level-0 block of a regular object */
+       } else if (type == DMU_OT_SA) {
                uint32_t aflags = ARC_WAIT;
                arc_buf_t *abuf;
                int blksz = BP_GET_LSIZE(bp);
@@ -203,8 +355,20 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
                    ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
                        return (EIO);
 
+               err = dump_spill(ba, zb->zb_object, blksz, abuf->b_data);
+               (void) arc_buf_remove_ref(abuf, &abuf);
+       } else { /* it's a level-0 block of a regular object */
+               uint32_t aflags = ARC_WAIT;
+               arc_buf_t *abuf;
+               int blksz = BP_GET_LSIZE(bp);
+
+               if (dsl_read(NULL, spa, bp, pbuf,
+                   arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
+                   ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
+                       return (EIO);
+
                err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz,
-                   blksz, abuf->b_data);
+                   blksz, bp, abuf->b_data);
                (void) arc_buf_remove_ref(abuf, &abuf);
        }
 
@@ -216,8 +380,8 @@ int
 dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
     vnode_t *vp, offset_t *off)
 {
-       dsl_dataset_t *ds = tosnap->os->os_dsl_dataset;
-       dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL;
+       dsl_dataset_t *ds = tosnap->os_dsl_dataset;
+       dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
        dmu_replay_record_t *drr;
        struct backuparg ba;
        int err;
@@ -254,10 +418,25 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
        drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
        drr->drr_type = DRR_BEGIN;
        drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
-       drr->drr_u.drr_begin.drr_version = DMU_BACKUP_STREAM_VERSION;
+       DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
+           DMU_SUBSTREAM);
+
+#ifdef _KERNEL
+       if (dmu_objset_type(tosnap) == DMU_OST_ZFS) {
+               uint64_t version;
+               if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0)
+                       return (EINVAL);
+               if (version == ZPL_VERSION_SA) {
+                       DMU_SET_FEATUREFLAGS(
+                           drr->drr_u.drr_begin.drr_versioninfo,
+                           DMU_BACKUP_FEATURE_SA_SPILL);
+               }
+       }
+#endif
+
        drr->drr_u.drr_begin.drr_creation_time =
            ds->ds_phys->ds_creation_time;
-       drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type;
+       drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type;
        if (fromorigin)
                drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
        drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
@@ -277,9 +456,11 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
        ba.vp = vp;
        ba.os = tosnap;
        ba.off = off;
+       ba.toguid = ds->ds_phys->ds_guid;
        ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);
+       ba.pending_op = PENDING_NONE;
 
-       if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
+       if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) {
                kmem_free(drr, sizeof (dmu_replay_record_t));
                return (ba.err);
        }
@@ -287,6 +468,10 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
        err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
            backup_cb, &ba);
 
+       if (ba.pending_op != PENDING_NONE)
+               if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0)
+                       err = EINTR;
+
        if (err) {
                if (err == EINTR && ba.err)
                        err = ba.err;
@@ -297,8 +482,9 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
        bzero(drr, sizeof (dmu_replay_record_t));
        drr->drr_type = DRR_END;
        drr->drr_u.drr_end.drr_checksum = ba.zc;
+       drr->drr_u.drr_end.drr_toguid = ba.toguid;
 
-       if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
+       if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) {
                kmem_free(drr, sizeof (dmu_replay_record_t));
                return (ba.err);
        }
@@ -319,33 +505,12 @@ struct recvbeginsyncarg {
        uint64_t dsflags;
        char clonelastname[MAXNAMELEN];
        dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
+       cred_t *cr;
 };
 
-static dsl_dataset_t *
-recv_full_sync_impl(dsl_pool_t *dp, uint64_t dsobj, dmu_objset_type_t type,
-    cred_t *cr, dmu_tx_t *tx)
-{
-       dsl_dataset_t *ds;
-
-       /* This should always work, since we just created it */
-       /* XXX - create should return an owned ds */
-       VERIFY(0 == dsl_dataset_own_obj(dp, dsobj,
-           DS_MODE_INCONSISTENT, dmu_recv_tag, &ds));
-
-       if (type != DMU_OST_NONE) {
-               (void) dmu_objset_create_impl(dp->dp_spa,
-                   ds, &ds->ds_phys->ds_bp, type, tx);
-       }
-
-       spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC,
-           dp->dp_spa, tx, cr, "dataset = %lld", dsobj);
-
-       return (ds);
-}
-
 /* ARGSUSED */
 static int
-recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
+recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dir_t *dd = arg1;
        struct recvbeginsyncarg *rbsa = arg2;
@@ -363,7 +528,7 @@ recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
                /* make sure it's a snap in the same pool */
                if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool)
                        return (EXDEV);
-               if (rbsa->origin->ds_phys->ds_num_children == 0)
+               if (!dsl_dataset_is_snapshot(rbsa->origin))
                        return (EINVAL);
                if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
                        return (ENODEV);
@@ -373,82 +538,31 @@ recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
 }
 
 static void
-recv_full_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dir_t *dd = arg1;
        struct recvbeginsyncarg *rbsa = arg2;
        uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
        uint64_t dsobj;
 
+       /* Create and open new dataset. */
        dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
-           rbsa->origin, flags, cr, tx);
-
-       rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj,
-           rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx);
-}
-
-static int
-recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-       dsl_dataset_t *ds = arg1;
-       struct recvbeginsyncarg *rbsa = arg2;
-       int err;
-       struct dsl_ds_destroyarg dsda = {0};
-
-       /* must be a head ds */
-       if (ds->ds_phys->ds_next_snap_obj != 0)
-               return (EINVAL);
-
-       /* must not be a clone ds */
-       if (dsl_dir_is_clone(ds->ds_dir))
-               return (EINVAL);
-
-       dsda.ds = ds;
-       err = dsl_dataset_destroy_check(&dsda, rbsa->tag, tx);
-       if (err)
-               return (err);
+           rbsa->origin, flags, rbsa->cr, tx);
+       VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj,
+           B_TRUE, dmu_recv_tag, &rbsa->ds));
 
-       if (rbsa->origin) {
-               /* make sure it's a snap in the same pool */
-               if (rbsa->origin->ds_dir->dd_pool != ds->ds_dir->dd_pool)
-                       return (EXDEV);
-               if (rbsa->origin->ds_phys->ds_num_children == 0)
-                       return (EINVAL);
-               if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
-                       return (ENODEV);
+       if (rbsa->origin == NULL) {
+               (void) dmu_objset_create_impl(dd->dd_pool->dp_spa,
+                   rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx);
        }
 
-       return (0);
-}
-
-static void
-recv_full_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
-       dsl_dataset_t *ds = arg1;
-       struct recvbeginsyncarg *rbsa = arg2;
-       dsl_dir_t *dd = ds->ds_dir;
-       uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
-       uint64_t dsobj;
-       struct dsl_ds_destroyarg dsda = {0};
-
-       /*
-        * NB: caller must provide an extra hold on the dsl_dir_t, so it
-        * won't go away when dsl_dataset_destroy_sync() closes the
-        * dataset.
-        */
-       dsda.ds = ds;
-       dsl_dataset_destroy_sync(&dsda, rbsa->tag, cr, tx);
-       ASSERT3P(dsda.rm_origin, ==, NULL);
-
-       dsobj = dsl_dataset_create_sync_dd(dd, rbsa->origin, flags, tx);
-
-       rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj,
-           rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx);
+       spa_history_log_internal(LOG_DS_REPLAY_FULL_SYNC,
+           dd->dd_pool->dp_spa, tx, "dataset = %lld", dsobj);
 }
 
 /* ARGSUSED */
 static int
-recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
+recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dataset_t *ds = arg1;
        struct recvbeginsyncarg *rbsa = arg2;
@@ -459,13 +573,43 @@ recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
        if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
                return (ETXTBSY);
 
-       /* must already be a snapshot of this fs */
-       if (ds->ds_phys->ds_prev_snap_obj == 0)
-               return (ENODEV);
+       if (rbsa->fromguid) {
+               /* if incremental, most recent snapshot must match fromguid */
+               if (ds->ds_prev == NULL)
+                       return (ENODEV);
 
-       /* most recent snapshot must match fromguid */
-       if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid)
-               return (ENODEV);
+               /*
+                * most recent snapshot must match fromguid, or there are no
+                * changes since the fromguid one
+                */
+               if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) {
+                       uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth;
+                       uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj;
+                       while (obj != 0) {
+                               dsl_dataset_t *snap;
+                               err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
+                                   obj, FTAG, &snap);
+                               if (err)
+                                       return (ENODEV);
+                               if (snap->ds_phys->ds_creation_txg < birth) {
+                                       dsl_dataset_rele(snap, FTAG);
+                                       return (ENODEV);
+                               }
+                               if (snap->ds_phys->ds_guid == rbsa->fromguid) {
+                                       dsl_dataset_rele(snap, FTAG);
+                                       break; /* it's ok */
+                               }
+                               obj = snap->ds_phys->ds_prev_snap_obj;
+                               dsl_dataset_rele(snap, FTAG);
+                       }
+                       if (obj == 0)
+                               return (ENODEV);
+               }
+       } else {
+               /* if full, most recent snapshot must be $ORIGIN */
+               if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
+                       return (ENODEV);
+       }
 
        /* temporary clone name must not exist */
        err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
@@ -488,34 +632,46 @@ recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
 
 /* ARGSUSED */
 static void
-recv_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dataset_t *ohds = arg1;
        struct recvbeginsyncarg *rbsa = arg2;
        dsl_pool_t *dp = ohds->ds_dir->dd_pool;
-       dsl_dataset_t *ods, *cds;
+       dsl_dataset_t *cds;
        uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
        uint64_t dsobj;
 
-       /* create the temporary clone */
-       VERIFY(0 == dsl_dataset_hold_obj(dp, ohds->ds_phys->ds_prev_snap_obj,
-           FTAG, &ods));
-       dsobj = dsl_dataset_create_sync(ohds->ds_dir,
-           rbsa->clonelastname, ods, flags, cr, tx);
-       dsl_dataset_rele(ods, FTAG);
-
-       /* open the temporary clone */
-       VERIFY(0 == dsl_dataset_own_obj(dp, dsobj,
-           DS_MODE_INCONSISTENT, dmu_recv_tag, &cds));
+       /* create and open the temporary clone */
+       dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname,
+           ohds->ds_prev, flags, rbsa->cr, tx);
+       VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds));
 
-       /* copy the refquota from the target fs to the clone */
-       if (ohds->ds_quota > 0)
-               dsl_dataset_set_quota_sync(cds, &ohds->ds_quota, cr, tx);
+       /*
+        * If we actually created a non-clone, we need to create the
+        * objset in our new dataset.
+        */
+       if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) {
+               (void) dmu_objset_create_impl(dp->dp_spa,
+                   cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx);
+       }
 
        rbsa->ds = cds;
 
-       spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
-           dp->dp_spa, tx, cr, "dataset = %lld", dsobj);
+       spa_history_log_internal(LOG_DS_REPLAY_INC_SYNC,
+           dp->dp_spa, tx, "dataset = %lld", dsobj);
+}
+
+
+static boolean_t
+dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb)
+{
+       int featureflags;
+
+       featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
+
+       /* Verify pool version supports SA if SA_SPILL feature set */
+       return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
+           (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA));
 }
 
 /*
@@ -523,13 +679,13 @@ recv_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
  * succeeds; otherwise we will leak the holds on the datasets.
  */
 int
-dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
+dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb,
     boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc)
 {
        int err = 0;
        boolean_t byteswap;
-       struct recvbeginsyncarg rbsa;
-       uint64_t version;
+       struct recvbeginsyncarg rbsa = { 0 };
+       uint64_t versioninfo;
        int flags;
        dsl_dataset_t *ds;
 
@@ -542,22 +698,23 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
 
        rbsa.tofs = tofs;
        rbsa.tosnap = tosnap;
-       rbsa.origin = origin ? origin->os->os_dsl_dataset : NULL;
+       rbsa.origin = origin ? origin->os_dsl_dataset : NULL;
        rbsa.fromguid = drrb->drr_fromguid;
        rbsa.type = drrb->drr_type;
        rbsa.tag = FTAG;
        rbsa.dsflags = 0;
-       version = drrb->drr_version;
+       rbsa.cr = CRED();
+       versioninfo = drrb->drr_versioninfo;
        flags = drrb->drr_flags;
 
        if (byteswap) {
                rbsa.type = BSWAP_32(rbsa.type);
                rbsa.fromguid = BSWAP_64(rbsa.fromguid);
-               version = BSWAP_64(version);
+               versioninfo = BSWAP_64(versioninfo);
                flags = BSWAP_32(flags);
        }
 
-       if (version != DMU_BACKUP_STREAM_VERSION ||
+       if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM ||
            rbsa.type >= DMU_OST_NUMTYPES ||
            ((flags & DRR_FLAG_CLONE) && origin == NULL))
                return (EINVAL);
@@ -568,22 +725,27 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
        bzero(drc, sizeof (dmu_recv_cookie_t));
        drc->drc_drrb = drrb;
        drc->drc_tosnap = tosnap;
+       drc->drc_top_ds = top_ds;
        drc->drc_force = force;
 
        /*
         * Process the begin in syncing context.
         */
-       if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) {
-               /* incremental receive */
 
-               /* tmp clone name is: tofs/%tosnap" */
-               (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
-                   "%%%s", tosnap);
+       /* open the dataset we are logically receiving into */
+       err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
+       if (err == 0) {
+               if (dmu_recv_verify_features(ds, drrb)) {
+                       dsl_dataset_rele(ds, dmu_recv_tag);
+                       return (ENOTSUP);
+               }
+               /* target fs already exists; recv into temp clone */
 
-               /* open the dataset we are logically receiving into */
-               err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
-               if (err)
-                       return (err);
+               /* Can't recv a clone into an existing fs */
+               if (flags & DRR_FLAG_CLONE) {
+                       dsl_dataset_rele(ds, dmu_recv_tag);
+                       return (EINVAL);
+               }
 
                /* must not have an incremental recv already in progress */
                if (!mutex_tryenter(&ds->ds_recvlock)) {
@@ -591,10 +753,12 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
                        return (EBUSY);
                }
 
+               /* tmp clone name is: tofs/%tosnap */
+               (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
+                   "%%%s", tosnap);
                rbsa.force = force;
                err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-                   recv_incremental_check,
-                   recv_incremental_sync, ds, &rbsa, 5);
+                   recv_existing_check, recv_existing_sync, ds, &rbsa, 5);
                if (err) {
                        mutex_exit(&ds->ds_recvlock);
                        dsl_dataset_rele(ds, dmu_recv_tag);
@@ -602,47 +766,40 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
                }
                drc->drc_logical_ds = ds;
                drc->drc_real_ds = rbsa.ds;
-       } else {
-               /* create new fs -- full backup or clone */
-               dsl_dir_t *dd = NULL;
-               const char *tail;
+       } else if (err == ENOENT) {
+               /* target fs does not exist; must be a full backup or clone */
+               char *cp;
 
-               err = dsl_dir_open(tofs, FTAG, &dd, &tail);
+               /*
+                * If it's a non-clone incremental, we are missing the
+                * target fs, so fail the recv.
+                */
+               if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE))
+                       return (ENOENT);
+
+               /* Open the parent of tofs */
+               cp = strrchr(tofs, '/');
+               *cp = '\0';
+               err = dsl_dataset_hold(tofs, FTAG, &ds);
+               *cp = '/';
                if (err)
                        return (err);
-               if (tail == NULL) {
-                       if (!force) {
-                               dsl_dir_close(dd, FTAG);
-                               return (EEXIST);
-                       }
 
-                       rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
-                       err = dsl_dataset_own_obj(dd->dd_pool,
-                           dd->dd_phys->dd_head_dataset_obj,
-                           DS_MODE_INCONSISTENT, FTAG, &ds);
-                       rw_exit(&dd->dd_pool->dp_config_rwlock);
-                       if (err) {
-                               dsl_dir_close(dd, FTAG);
-                               return (err);
-                       }
-
-                       dsl_dataset_make_exclusive(ds, FTAG);
-                       err = dsl_sync_task_do(dd->dd_pool,
-                           recv_full_existing_check,
-                           recv_full_existing_sync, ds, &rbsa, 5);
-                       dsl_dataset_disown(ds, FTAG);
-               } else {
-                       err = dsl_sync_task_do(dd->dd_pool, recv_full_check,
-                           recv_full_sync, dd, &rbsa, 5);
+               if (dmu_recv_verify_features(ds, drrb)) {
+                       dsl_dataset_rele(ds, dmu_recv_tag);
+                       return (ENOTSUP);
                }
-               dsl_dir_close(dd, FTAG);
+
+               err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+                   recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5);
+               dsl_dataset_rele(ds, FTAG);
                if (err)
                        return (err);
                drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds;
                drc->drc_newfs = B_TRUE;
        }
 
-       return (0);
+       return (err);
 }
 
 struct restorearg {
@@ -653,8 +810,83 @@ struct restorearg {
        uint64_t voff;
        int bufsize; /* amount of memory allocated for buf */
        zio_cksum_t cksum;
+       avl_tree_t guid_to_ds_map;
 };
 
+typedef struct guid_map_entry {
+       uint64_t        guid;
+       dsl_dataset_t   *gme_ds;
+       avl_node_t      avlnode;
+} guid_map_entry_t;
+
+static int
+guid_compare(const void *arg1, const void *arg2)
+{
+       const guid_map_entry_t *gmep1 = arg1;
+       const guid_map_entry_t *gmep2 = arg2;
+
+       if (gmep1->guid < gmep2->guid)
+               return (-1);
+       else if (gmep1->guid > gmep2->guid)
+               return (1);
+       return (0);
+}
+
+/*
+ * This function is a callback used by dmu_objset_find() (which
+ * enumerates the object sets) to build an avl tree that maps guids
+ * to datasets.  The resulting table is used when processing DRR_WRITE_BYREF
+ * send stream records.  These records, which are used in dedup'ed
+ * streams, do not contain data themselves, but refer to a copy
+ * of the data block that has already been written because it was
+ * earlier in the stream.  That previous copy is identified by the
+ * guid of the dataset with the referenced data.
+ */
+int
+find_ds_by_guid(const char *name, void *arg)
+{
+       avl_tree_t *guid_map = arg;
+       dsl_dataset_t *ds, *snapds;
+       guid_map_entry_t *gmep;
+       dsl_pool_t *dp;
+       int err;
+       uint64_t lastobj, firstobj;
+
+       if (dsl_dataset_hold(name, FTAG, &ds) != 0)
+               return (0);
+
+       dp = ds->ds_dir->dd_pool;
+       rw_enter(&dp->dp_config_rwlock, RW_READER);
+       firstobj = ds->ds_dir->dd_phys->dd_origin_obj;
+       lastobj = ds->ds_phys->ds_prev_snap_obj;
+
+       while (lastobj != firstobj) {
+               err = dsl_dataset_hold_obj(dp, lastobj, guid_map, &snapds);
+               if (err) {
+                       /*
+                        * Skip this snapshot and move on. It's not
+                        * clear why this would ever happen, but the
+                        * remainder of the snapshot stream can be
+                        * processed.
+                        */
+                       rw_exit(&dp->dp_config_rwlock);
+                       dsl_dataset_rele(ds, FTAG);
+                       return (0);
+               }
+
+               gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP);
+               gmep->guid = snapds->ds_phys->ds_guid;
+               gmep->gme_ds = snapds;
+               avl_add(guid_map, gmep);
+               lastobj = snapds->ds_phys->ds_prev_snap_obj;
+       }
+
+       rw_exit(&dp->dp_config_rwlock);
+       dsl_dataset_rele(ds, FTAG);
+
+       return (0);
+}
+
 static void *
 restore_read(struct restorearg *ra, int len)
 {
@@ -699,7 +931,7 @@ backup_byteswap(dmu_replay_record_t *drr)
        switch (drr->drr_type) {
        case DRR_BEGIN:
                DO64(drr_begin.drr_magic);
-               DO64(drr_begin.drr_version);
+               DO64(drr_begin.drr_versioninfo);
                DO64(drr_begin.drr_creation_time);
                DO32(drr_begin.drr_type);
                DO32(drr_begin.drr_flags);
@@ -713,27 +945,56 @@ backup_byteswap(dmu_replay_record_t *drr)
                DO32(drr_object.drr_bonustype);
                DO32(drr_object.drr_blksz);
                DO32(drr_object.drr_bonuslen);
+               DO64(drr_object.drr_toguid);
                break;
        case DRR_FREEOBJECTS:
                DO64(drr_freeobjects.drr_firstobj);
                DO64(drr_freeobjects.drr_numobjs);
+               DO64(drr_freeobjects.drr_toguid);
                break;
        case DRR_WRITE:
                DO64(drr_write.drr_object);
                DO32(drr_write.drr_type);
                DO64(drr_write.drr_offset);
                DO64(drr_write.drr_length);
+               DO64(drr_write.drr_toguid);
+               DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
+               DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
+               DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
+               DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
+               DO64(drr_write.drr_key.ddk_prop);
+               break;
+       case DRR_WRITE_BYREF:
+               DO64(drr_write_byref.drr_object);
+               DO64(drr_write_byref.drr_offset);
+               DO64(drr_write_byref.drr_length);
+               DO64(drr_write_byref.drr_toguid);
+               DO64(drr_write_byref.drr_refguid);
+               DO64(drr_write_byref.drr_refobject);
+               DO64(drr_write_byref.drr_refoffset);
+               DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
+               DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
+               DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
+               DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
+               DO64(drr_write_byref.drr_key.ddk_prop);
                break;
        case DRR_FREE:
                DO64(drr_free.drr_object);
                DO64(drr_free.drr_offset);
                DO64(drr_free.drr_length);
+               DO64(drr_free.drr_toguid);
+               break;
+       case DRR_SPILL:
+               DO64(drr_spill.drr_object);
+               DO64(drr_spill.drr_length);
+               DO64(drr_spill.drr_toguid);
                break;
        case DRR_END:
                DO64(drr_end.drr_checksum.zc_word[0]);
                DO64(drr_end.drr_checksum.zc_word[1]);
                DO64(drr_end.drr_checksum.zc_word[2]);
                DO64(drr_end.drr_checksum.zc_word[3]);
+               DO64(drr_end.drr_toguid);
                break;
        }
 #undef DO64
@@ -750,7 +1011,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
        if (drro->drr_type == DMU_OT_NONE ||
            drro->drr_type >= DMU_OT_NUMTYPES ||
            drro->drr_bonustype >= DMU_OT_NUMTYPES ||
-           drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS ||
+           drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
            drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
            P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
            drro->drr_blksz < SPA_MINBLOCKSIZE ||
@@ -789,8 +1050,9 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
                    drro->drr_type, drro->drr_blksz,
                    drro->drr_bonustype, drro->drr_bonuslen);
        }
-       if (err)
+       if (err) {
                return (EINVAL);
+       }
 
        tx = dmu_tx_create(os);
        dmu_tx_hold_bonus(tx, drro->drr_object);
@@ -800,7 +1062,8 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
                return (err);
        }
 
-       dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx);
+       dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
+           tx);
        dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
 
        if (data != NULL) {
@@ -882,6 +1145,114 @@ restore_write(struct restorearg *ra, objset_t *os,
        return (0);
 }
 
+/*
+ * Handle a DRR_WRITE_BYREF record.  This record is used in dedup'ed
+ * streams to refer to a copy of the data that is already on the
+ * system because it came in earlier in the stream.  This function
+ * finds the earlier copy of the data, and uses that copy instead of
+ * data from the stream to fulfill this write.
+ */
+static int
+restore_write_byref(struct restorearg *ra, objset_t *os,
+    struct drr_write_byref *drrwbr)
+{
+       dmu_tx_t *tx;
+       int err;
+       guid_map_entry_t gmesrch;
+       guid_map_entry_t *gmep;
+       avl_index_t     where;
+       objset_t *ref_os = NULL;
+       dmu_buf_t *dbp;
+
+       if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
+               return (EINVAL);
+
+       /*
+        * If the GUID of the referenced dataset is different from the
+        * GUID of the target dataset, find the referenced dataset.
+        */
+       if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
+               gmesrch.guid = drrwbr->drr_refguid;
+               if ((gmep = avl_find(&ra->guid_to_ds_map, &gmesrch,
+                   &where)) == NULL) {
+                       return (EINVAL);
+               }
+               if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
+                       return (EINVAL);
+       } else {
+               ref_os = os;
+       }
+
+       if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
+           drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH))
+               return (err);
+
+       tx = dmu_tx_create(os);
+
+       dmu_tx_hold_write(tx, drrwbr->drr_object,
+           drrwbr->drr_offset, drrwbr->drr_length);
+       err = dmu_tx_assign(tx, TXG_WAIT);
+       if (err) {
+               dmu_tx_abort(tx);
+               return (err);
+       }
+       dmu_write(os, drrwbr->drr_object,
+           drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
+       dmu_buf_rele(dbp, FTAG);
+       dmu_tx_commit(tx);
+       return (0);
+}
+
+static int
+restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
+{
+       dmu_tx_t *tx;
+       void *data;
+       dmu_buf_t *db, *db_spill;
+       int err;
+
+       if (drrs->drr_length < SPA_MINBLOCKSIZE ||
+           drrs->drr_length > SPA_MAXBLOCKSIZE)
+               return (EINVAL);
+
+       data = restore_read(ra, drrs->drr_length);
+       if (data == NULL)
+               return (ra->err);
+
+       if (dmu_object_info(os, drrs->drr_object, NULL) != 0)
+               return (EINVAL);
+
+       VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db));
+       if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
+               dmu_buf_rele(db, FTAG);
+               return (err);
+       }
+
+       tx = dmu_tx_create(os);
+
+       dmu_tx_hold_spill(tx, db->db_object);
+
+       err = dmu_tx_assign(tx, TXG_WAIT);
+       if (err) {
+               dmu_buf_rele(db, FTAG);
+               dmu_buf_rele(db_spill, FTAG);
+               dmu_tx_abort(tx);
+               return (err);
+       }
+       dmu_buf_will_dirty(db_spill, tx);
+
+       if (db_spill->db_size < drrs->drr_length)
+               VERIFY(0 == dbuf_spill_set_blksz(db_spill,
+                   drrs->drr_length, tx));
+       bcopy(data, db_spill->db_data, drrs->drr_length);
+
+       dmu_buf_rele(db, FTAG);
+       dmu_buf_rele(db_spill, FTAG);
+
+       dmu_tx_commit(tx);
+       return (0);
+}
+
 /* ARGSUSED */
 static int
 restore_free(struct restorearg *ra, objset_t *os,
@@ -911,6 +1282,8 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp)
        dmu_replay_record_t *drr;
        objset_t *os;
        zio_cksum_t pcksum;
+       guid_map_entry_t *gmep;
+       int featureflags;
 
        if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
                ra.byteswap = TRUE;
@@ -935,7 +1308,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp)
        if (ra.byteswap) {
                struct drr_begin *drrb = drc->drc_drrb;
                drrb->drr_magic = BSWAP_64(drrb->drr_magic);
-               drrb->drr_version = BSWAP_64(drrb->drr_version);
+               drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
                drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
                drrb->drr_type = BSWAP_32(drrb->drr_type);
                drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
@@ -948,16 +1321,29 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp)
        ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
 
        /* these were verified in dmu_recv_begin */
-       ASSERT(drc->drc_drrb->drr_version == DMU_BACKUP_STREAM_VERSION);
+       ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) ==
+           DMU_SUBSTREAM);
        ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES);
 
        /*
         * Open the objset we are modifying.
         */
-       VERIFY(dmu_objset_open_ds(drc->drc_real_ds, DMU_OST_ANY, &os) == 0);
+       VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0);
 
        ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);
 
+       featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
+
+       /* if this stream is dedup'ed, set up the avl tree for guid mapping */
+       if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
+               avl_create(&ra.guid_to_ds_map, guid_compare,
+                   sizeof (guid_map_entry_t),
+                   offsetof(guid_map_entry_t, avlnode));
+               (void) dmu_objset_find(drc->drc_top_ds, find_ds_by_guid,
+                   (void *)&ra.guid_to_ds_map,
+                   DS_FIND_CHILDREN);
+       }
+
        /*
         * Read records and process them.
         */
@@ -997,6 +1383,13 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp)
                        ra.err = restore_write(&ra, os, &drrw);
                        break;
                }
+               case DRR_WRITE_BYREF:
+               {
+                       struct drr_write_byref drrwbr =
+                           drr->drr_u.drr_write_byref;
+                       ra.err = restore_write_byref(&ra, os, &drrwbr);
+                       break;
+               }
                case DRR_FREE:
                {
                        struct drr_free drrf = drr->drr_u.drr_free;
@@ -1015,6 +1408,12 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp)
                                ra.err = ECKSUM;
                        goto out;
                }
+               case DRR_SPILL:
+               {
+                       struct drr_spill drrs = drr->drr_u.drr_spill;
+                       ra.err = restore_spill(&ra, os, &drrs);
+                       break;
+               }
                default:
                        ra.err = EINVAL;
                        goto out;
@@ -1024,8 +1423,6 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp)
        ASSERT(ra.err != 0);
 
 out:
-       dmu_objset_close(os);
-
        if (ra.err != 0) {
                /*
                 * destroy what we created, so we don't leave it in the
@@ -1041,6 +1438,16 @@ out:
                }
        }
 
+       if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
+               void *cookie = NULL;
+
+               while (gmep = avl_destroy_nodes(&ra.guid_to_ds_map, &cookie)) {
+                       dsl_dataset_rele(gmep->gme_ds, &ra.guid_to_ds_map);
+                       kmem_free(gmep, sizeof (guid_map_entry_t));
+               }
+               avl_destroy(&ra.guid_to_ds_map);
+       }
+
        kmem_free(ra.buf, ra.bufsize);
        *voffp = ra.voff;
        return (ra.err);
@@ -1062,12 +1469,12 @@ recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
 }
 
 static void
-recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dataset_t *ds = arg1;
        struct recvendsyncarg *resa = arg2;
 
-       dsl_dataset_snapshot_sync(ds, resa->tosnap, cr, tx);
+       dsl_dataset_snapshot_sync(ds, resa->tosnap, tx);
 
        /* set snapshot's creation time and guid */
        dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
@@ -1079,37 +1486,31 @@ recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
        ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
 }
 
-int
-dmu_recv_end(dmu_recv_cookie_t *drc)
+static int
+dmu_recv_existing_end(dmu_recv_cookie_t *drc)
 {
        struct recvendsyncarg resa;
        dsl_dataset_t *ds = drc->drc_logical_ds;
        int err;
 
        /*
-        * XXX hack; seems the ds is still dirty and
-        * dsl_pool_zil_clean() expects it to have a ds_user_ptr
-        * (and zil), but clone_swap() can close it.
+        * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
+        * expects it to have a ds_user_ptr (and zil), but clone_swap()
+        * can close it.
         */
        txg_wait_synced(ds->ds_dir->dd_pool, 0);
 
-       if (ds != drc->drc_real_ds) {
-               /* we are doing an online recv */
-               if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
-                       err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
-                           drc->drc_force);
-                       if (err)
-                               dsl_dataset_disown(ds, dmu_recv_tag);
-               } else {
-                       err = EBUSY;
-                       dsl_dataset_rele(ds, dmu_recv_tag);
-               }
-               /* dsl_dataset_destroy() will disown the ds */
+       if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
+               err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
+                   drc->drc_force);
+               if (err)
+                       goto out;
+       } else {
+               mutex_exit(&ds->ds_recvlock);
+               dsl_dataset_rele(ds, dmu_recv_tag);
                (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
                    B_FALSE);
-               mutex_exit(&drc->drc_logical_ds->ds_recvlock);
-               if (err)
-                       return (err);
+               return (EBUSY);
        }
 
        resa.creation_time = drc->drc_drrb->drr_creation_time;
@@ -1119,17 +1520,52 @@ dmu_recv_end(dmu_recv_cookie_t *drc)
        err = dsl_sync_task_do(ds->ds_dir->dd_pool,
            recv_end_check, recv_end_sync, ds, &resa, 3);
        if (err) {
-               if (drc->drc_newfs) {
-                       ASSERT(ds == drc->drc_real_ds);
-                       (void) dsl_dataset_destroy(ds, dmu_recv_tag,
-                           B_FALSE);
-                       return (err);
-               } else {
-                       (void) dsl_dataset_rollback(ds, DMU_OST_NONE);
-               }
+               /* swap back */
+               (void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE);
        }
 
-       /* release the hold from dmu_recv_begin */
+out:
+       mutex_exit(&ds->ds_recvlock);
        dsl_dataset_disown(ds, dmu_recv_tag);
+       (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE);
        return (err);
 }
+
+static int
+dmu_recv_new_end(dmu_recv_cookie_t *drc)
+{
+       struct recvendsyncarg resa;
+       dsl_dataset_t *ds = drc->drc_logical_ds;
+       int err;
+
+       /*
+        * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
+        * expects it to have a ds_user_ptr (and zil), but clone_swap()
+        * can close it.
+        */
+       txg_wait_synced(ds->ds_dir->dd_pool, 0);
+
+       resa.creation_time = drc->drc_drrb->drr_creation_time;
+       resa.toguid = drc->drc_drrb->drr_toguid;
+       resa.tosnap = drc->drc_tosnap;
+
+       err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+           recv_end_check, recv_end_sync, ds, &resa, 3);
+       if (err) {
+               /* clean up the fs we just recv'd into */
+               (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE);
+       } else {
+               /* release the hold from dmu_recv_begin */
+               dsl_dataset_disown(ds, dmu_recv_tag);
+       }
+       return (err);
+}
+
+int
+dmu_recv_end(dmu_recv_cookie_t *drc)
+{
+       if (drc->drc_logical_ds != drc->drc_real_ds)
+               return (dmu_recv_existing_end(drc));
+       else
+               return (dmu_recv_new_end(drc));
+}
index 89cbfad..429c76a 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu_impl.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
 #include <sys/callb.h>
 
-#define        SET_BOOKMARK(zb, objset, object, level, blkid)  \
-{                                                       \
-       (zb)->zb_objset = objset;                       \
-       (zb)->zb_object = object;                       \
-       (zb)->zb_level = level;                         \
-       (zb)->zb_blkid = blkid;                         \
-}
-
 struct prefetch_data {
        kmutex_t pd_mtx;
        kcondvar_t pd_cv;
@@ -68,27 +61,28 @@ static int traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
     arc_buf_t *buf, uint64_t objset, uint64_t object);
 
 /* ARGSUSED */
-static void
+static int
 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
        struct traverse_data *td = arg;
        zbookmark_t zb;
 
        if (bp->blk_birth == 0)
-               return;
+               return (0);
 
        if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
-               return;
+               return (0);
+
+       SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
+           bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+       (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, td->td_arg);
 
-       zb.zb_objset = td->td_objset;
-       zb.zb_object = 0;
-       zb.zb_level = -1;
-       zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
-       VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
+       return (0);
 }
 
 /* ARGSUSED */
-static void
+static int
 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
 {
        struct traverse_data *td = arg;
@@ -99,17 +93,18 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
                zbookmark_t zb;
 
                if (bp->blk_birth == 0)
-                       return;
+                       return (0);
 
                if (claim_txg == 0 || bp->blk_birth < claim_txg)
-                       return;
+                       return (0);
 
-               zb.zb_objset = td->td_objset;
-               zb.zb_object = lr->lr_foid;
-               zb.zb_level = BP_GET_LEVEL(bp);
-               zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
-               VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
+               SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL,
+                   lr->lr_offset / BP_GET_LSIZE(bp));
+
+               (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL,
+                   td->td_arg);
        }
+       return (0);
 }
 
 static void
@@ -120,7 +115,7 @@ traverse_zil(struct traverse_data *td, zil_header_t *zh)
 
        /*
         * We only want to visit blocks that have been claimed but not yet
-        * replayed (or, in read-only mode, blocks that *would* be claimed).
+        * replayed; plus, in read-only mode, blocks that are already stable.
         */
        if (claim_txg == 0 && spa_writeable(td->td_spa))
                return;
@@ -138,12 +133,14 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
     arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
 {
        zbookmark_t czb;
-       int err = 0;
+       int err = 0, lasterr = 0;
        arc_buf_t *buf = NULL;
        struct prefetch_data *pd = td->td_pfd;
+       boolean_t hard = td->td_flags & TRAVERSE_HARD;
 
        if (bp->blk_birth == 0) {
-               err = td->td_func(td->td_spa, NULL, zb, dnp, td->td_arg);
+               err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp,
+                   td->td_arg);
                return (err);
        }
 
@@ -163,7 +160,8 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
        }
 
        if (td->td_flags & TRAVERSE_PRE) {
-               err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
+               err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
+                   td->td_arg);
                if (err)
                        return (err);
        }
@@ -174,7 +172,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
                blkptr_t *cbp;
                int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 
-               err = arc_read(NULL, td->td_spa, bp, pbuf,
+               err = dsl_read(NULL, td->td_spa, bp, pbuf,
                    arc_getbuf_func, &buf,
                    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
                if (err)
@@ -187,15 +185,18 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
                            zb->zb_level - 1,
                            zb->zb_blkid * epb + i);
                        err = traverse_visitbp(td, dnp, buf, cbp, &czb);
-                       if (err)
-                               break;
+                       if (err) {
+                               if (!hard)
+                                       break;
+                               lasterr = err;
+                       }
                }
        } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
                uint32_t flags = ARC_WAIT;
                int i;
                int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 
-               err = arc_read(NULL, td->td_spa, bp, pbuf,
+               err = dsl_read(NULL, td->td_spa, bp, pbuf,
                    arc_getbuf_func, &buf,
                    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
                if (err)
@@ -203,18 +204,21 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
 
                /* recursively visitbp() blocks below this */
                dnp = buf->b_data;
-               for (i = 0; i < epb && err == 0; i++, dnp++) {
+               for (i = 0; i < epb; i++, dnp++) {
                        err = traverse_dnode(td, dnp, buf, zb->zb_objset,
                            zb->zb_blkid * epb + i);
-                       if (err)
-                               break;
+                       if (err) {
+                               if (!hard)
+                                       break;
+                               lasterr = err;
+                       }
                }
        } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
                uint32_t flags = ARC_WAIT;
                objset_phys_t *osp;
                dnode_phys_t *dnp;
 
-               err = arc_read_nolock(NULL, td->td_spa, bp,
+               err = dsl_read_nolock(NULL, td->td_spa, bp,
                    arc_getbuf_func, &buf,
                    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
                if (err)
@@ -224,12 +228,21 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
                traverse_zil(td, &osp->os_zil_header);
 
                dnp = &osp->os_meta_dnode;
-               err = traverse_dnode(td, dnp, buf, zb->zb_objset, 0);
+               err = traverse_dnode(td, dnp, buf, zb->zb_objset,
+                   DMU_META_DNODE_OBJECT);
+               if (err && hard) {
+                       lasterr = err;
+                       err = 0;
+               }
                if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
                        dnp = &osp->os_userused_dnode;
                        err = traverse_dnode(td, dnp, buf, zb->zb_objset,
                            DMU_USERUSED_OBJECT);
                }
+               if (err && hard) {
+                       lasterr = err;
+                       err = 0;
+               }
                if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
                        dnp = &osp->os_groupused_dnode;
                        err = traverse_dnode(td, dnp, buf, zb->zb_objset,
@@ -240,33 +253,52 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
        if (buf)
                (void) arc_buf_remove_ref(buf, &buf);
 
-       if (err == 0 && (td->td_flags & TRAVERSE_POST))
-               err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
+       if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) {
+               err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
+                   td->td_arg);
+       }
 
-       return (err);
+       return (err != 0 ? err : lasterr);
 }
 
 static int
 traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
     arc_buf_t *buf, uint64_t objset, uint64_t object)
 {
-       int j, err = 0;
+       int j, err = 0, lasterr = 0;
        zbookmark_t czb;
+       boolean_t hard = (td->td_flags & TRAVERSE_HARD);
 
        for (j = 0; j < dnp->dn_nblkptr; j++) {
                SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
                err = traverse_visitbp(td, dnp, buf,
                    (blkptr_t *)&dnp->dn_blkptr[j], &czb);
-               if (err)
-                       break;
+               if (err) {
+                       if (!hard)
+                               break;
+                       lasterr = err;
+               }
        }
-       return (err);
+
+       if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+               SET_BOOKMARK(&czb, objset,
+                   object, 0, DMU_SPILL_BLKID);
+               err = traverse_visitbp(td, dnp, buf,
+                   (blkptr_t *)&dnp->dn_spill, &czb);
+               if (err) {
+                       if (!hard)
+                               return (err);
+                       lasterr = err;
+               }
+       }
+       return (err != 0 ? err : lasterr);
 }
 
 /* ARGSUSED */
 static int
-traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
-    const dnode_phys_t *dnp, void *arg)
+traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp,
+    void *arg)
 {
        struct prefetch_data *pfd = arg;
        uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
@@ -276,7 +308,8 @@ traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
                return (EINTR);
 
        if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
-           BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0))
+           BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) ||
+           BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
                return (0);
 
        mutex_enter(&pfd->pd_mtx);
@@ -286,7 +319,7 @@ traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
        cv_broadcast(&pfd->pd_cv);
        mutex_exit(&pfd->pd_mtx);
 
-       (void) arc_read_nolock(NULL, spa, bp, NULL, NULL,
+       (void) dsl_read(NULL, spa, bp, pbuf, NULL, NULL,
            ZIO_PRIORITY_ASYNC_READ,
            ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
            &aflags, zb);
@@ -305,7 +338,8 @@ traverse_prefetch_thread(void *arg)
        td.td_arg = td_main->td_pfd;
        td.td_pfd = NULL;
 
-       SET_BOOKMARK(&czb, td.td_objset, 0, -1, 0);
+       SET_BOOKMARK(&czb, td.td_objset,
+           ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
        (void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb);
 
        mutex_enter(&td_main->td_pfd->pd_mtx);
@@ -346,7 +380,8 @@ traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp,
            &td, TQ_NOQUEUE))
                pd.pd_exited = B_TRUE;
 
-       SET_BOOKMARK(&czb, objset, 0, -1, 0);
+       SET_BOOKMARK(&czb, objset,
+           ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
        err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb);
 
        mutex_enter(&pd.pd_mtx);
@@ -378,43 +413,59 @@ traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
  * NB: pool must not be changing on-disk (eg, from zdb or sync context).
  */
 int
-traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg)
+traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
+    blkptr_cb_t func, void *arg)
 {
-       int err;
+       int err, lasterr = 0;
        uint64_t obj;
        dsl_pool_t *dp = spa_get_dsl(spa);
        objset_t *mos = dp->dp_meta_objset;
+       boolean_t hard = (flags & TRAVERSE_HARD);
 
        /* visit the MOS */
        err = traverse_impl(spa, 0, spa_get_rootblkptr(spa),
-           0, TRAVERSE_PRE, func, arg);
+           txg_start, flags, func, arg);
        if (err)
                return (err);
 
        /* visit each dataset */
-       for (obj = 1; err == 0; err = dmu_object_next(mos, &obj, FALSE, 0)) {
+       for (obj = 1; err == 0 || (err != ESRCH && hard);
+           err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
                dmu_object_info_t doi;
 
                err = dmu_object_info(mos, obj, &doi);
-               if (err)
-                       return (err);
+               if (err) {
+                       if (!hard)
+                               return (err);
+                       lasterr = err;
+                       continue;
+               }
 
                if (doi.doi_type == DMU_OT_DSL_DATASET) {
                        dsl_dataset_t *ds;
+                       uint64_t txg = txg_start;
+
                        rw_enter(&dp->dp_config_rwlock, RW_READER);
                        err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
                        rw_exit(&dp->dp_config_rwlock);
-                       if (err)
-                               return (err);
-                       err = traverse_dataset(ds,
-                           ds->ds_phys->ds_prev_snap_txg, TRAVERSE_PRE,
-                           func, arg);
+                       if (err) {
+                               if (!hard)
+                                       return (err);
+                               lasterr = err;
+                               continue;
+                       }
+                       if (ds->ds_phys->ds_prev_snap_txg > txg)
+                               txg = ds->ds_phys->ds_prev_snap_txg;
+                       err = traverse_dataset(ds, txg, flags, func, arg);
                        dsl_dataset_rele(ds, FTAG);
-                       if (err)
-                               return (err);
+                       if (err) {
+                               if (!hard)
+                                       return (err);
+                               lasterr = err;
+                       }
                }
        }
        if (err == ESRCH)
                err = 0;
-       return (err);
+       return (err != 0 ? err : lasterr);
 }
index c6fbeee..5fc062c 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/dmu.h>
 #include <sys/dsl_pool.h>
 #include <sys/zap_impl.h> /* for fzap_default_block_shift */
 #include <sys/spa.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
 #include <sys/zfs_context.h>
+#include <sys/varargs.h>
 
 typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
     uint64_t arg1, uint64_t arg2);
@@ -48,6 +50,8 @@ dmu_tx_create_dd(dsl_dir_t *dd)
                tx->tx_pool = dd->dd_pool;
        list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
            offsetof(dmu_tx_hold_t, txh_node));
+       list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
+           offsetof(dmu_tx_callback_t, dcb_node));
 #ifdef ZFS_DEBUG
        refcount_create(&tx->tx_space_written);
        refcount_create(&tx->tx_space_freed);
@@ -58,9 +62,9 @@ dmu_tx_create_dd(dsl_dir_t *dd)
 dmu_tx_t *
 dmu_tx_create(objset_t *os)
 {
-       dmu_tx_t *tx = dmu_tx_create_dd(os->os->os_dsl_dataset->ds_dir);
+       dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
        tx->tx_objset = os;
-       tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset);
+       tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset);
        return (tx);
 }
 
@@ -98,7 +102,7 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
        int err;
 
        if (object != DMU_NEW_OBJECT) {
-               err = dnode_hold(os->os, object, tx, &dn);
+               err = dnode_hold(os, object, tx, &dn);
                if (err) {
                        tx->tx_err = err;
                        return (NULL);
@@ -161,38 +165,47 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
 }
 
 static void
-dmu_tx_count_indirects(dmu_tx_hold_t *txh, dmu_buf_impl_t *db,
-    boolean_t freeable, dmu_buf_impl_t **history)
+dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
+    int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
 {
-       int i = db->db_level + 1;
-       dnode_t *dn = db->db_dnode;
-
-       if (i >= dn->dn_nlevels)
+       objset_t *os = dn->dn_objset;
+       dsl_dataset_t *ds = os->os_dsl_dataset;
+       int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+       dmu_buf_impl_t *parent = NULL;
+       blkptr_t *bp = NULL;
+       uint64_t space;
+
+       if (level >= dn->dn_nlevels || history[level] == blkid)
                return;
 
-       db = db->db_parent;
-       if (db == NULL) {
-               uint64_t lvls = dn->dn_nlevels - i;
+       history[level] = blkid;
 
-               txh->txh_space_towrite += lvls << dn->dn_indblkshift;
-               return;
+       space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);
+
+       if (db == NULL || db == dn->dn_dbuf) {
+               ASSERT(level != 0);
+               db = NULL;
+       } else {
+               ASSERT(db->db_dnode == dn);
+               ASSERT(db->db_level == level);
+               ASSERT(db->db.db_size == space);
+               ASSERT(db->db_blkid == blkid);
+               bp = db->db_blkptr;
+               parent = db->db_parent;
        }
 
-       if (db != history[i]) {
-               dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
-               uint64_t space = 1ULL << dn->dn_indblkshift;
+       freeable = (bp && (freeable ||
+           dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));
 
-               freeable = (db->db_blkptr && (freeable ||
-                   dsl_dataset_block_freeable(ds, db->db_blkptr->blk_birth)));
-               if (freeable)
-                       txh->txh_space_tooverwrite += space;
-               else
-                       txh->txh_space_towrite += space;
-               if (db->db_blkptr)
-                       txh->txh_space_tounref += space;
-               history[i] = db;
-               dmu_tx_count_indirects(txh, db, freeable, history);
-       }
+       if (freeable)
+               txh->txh_space_tooverwrite += space;
+       else
+               txh->txh_space_towrite += space;
+       if (bp)
+               txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
+
+       dmu_tx_count_twig(txh, dn, parent, level + 1,
+           blkid >> epbs, freeable, history);
 }
 
 /* ARGSUSED */
@@ -213,7 +226,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
        max_ibs = DN_MAX_INDBLKSHIFT;
 
        if (dn) {
-               dmu_buf_impl_t *last[DN_MAX_LEVELS];
+               uint64_t history[DN_MAX_LEVELS];
                int nlvls = dn->dn_nlevels;
                int delta;
 
@@ -289,29 +302,24 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
                 * If this write is not off the end of the file
                 * we need to account for overwrites/unref.
                 */
-               if (start <= dn->dn_maxblkid)
-                       bzero(last, sizeof (dmu_buf_impl_t *) * DN_MAX_LEVELS);
+               if (start <= dn->dn_maxblkid) {
+                       for (int l = 0; l < DN_MAX_LEVELS; l++)
+                               history[l] = -1ULL;
+               }
                while (start <= dn->dn_maxblkid) {
-                       spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
-                       dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
                        dmu_buf_impl_t *db;
 
                        rw_enter(&dn->dn_struct_rwlock, RW_READER);
-                       db = dbuf_hold_level(dn, 0, start, FTAG);
+                       err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
                        rw_exit(&dn->dn_struct_rwlock);
-                       if (db->db_blkptr && dsl_dataset_block_freeable(ds,
-                           db->db_blkptr->blk_birth)) {
-                               dprintf_bp(db->db_blkptr, "can free old%s", "");
-                               txh->txh_space_tooverwrite += dn->dn_datablksz;
-                               txh->txh_space_tounref += dn->dn_datablksz;
-                               dmu_tx_count_indirects(txh, db, TRUE, last);
-                       } else {
-                               txh->txh_space_towrite += dn->dn_datablksz;
-                               if (db->db_blkptr)
-                                       txh->txh_space_tounref +=
-                                           bp_get_dasize(spa, db->db_blkptr);
-                               dmu_tx_count_indirects(txh, db, FALSE, last);
+
+                       if (err) {
+                               txh->txh_tx->tx_err = err;
+                               return;
                        }
+
+                       dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
+                           history);
                        dbuf_rele(db, FTAG);
                        if (++start > end) {
                                /*
@@ -376,13 +384,13 @@ static void
 dmu_tx_count_dnode(dmu_tx_hold_t *txh)
 {
        dnode_t *dn = txh->txh_dnode;
-       dnode_t *mdn = txh->txh_tx->tx_objset->os->os_meta_dnode;
+       dnode_t *mdn = txh->txh_tx->tx_objset->os_meta_dnode;
        uint64_t space = mdn->dn_datablksz +
            ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
 
        if (dn && dn->dn_dbuf->db_blkptr &&
            dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
-           dn->dn_dbuf->db_blkptr->blk_birth)) {
+           dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
                txh->txh_space_tooverwrite += space;
                txh->txh_space_tounref += space;
        } else {
@@ -427,7 +435,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
         * The struct_rwlock protects us against dn_nlevels
         * changing, in case (against all odds) we manage to dirty &
         * sync out the changes after we check for being dirty.
-        * Also, dbuf_hold_level() wants us to have the struct_rwlock.
+        * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
         */
        rw_enter(&dn->dn_struct_rwlock, RW_READER);
        epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
@@ -457,9 +465,9 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
                        blkptr_t *bp = dn->dn_phys->dn_blkptr;
                        ASSERT3U(blkid + i, <, dn->dn_nblkptr);
                        bp += blkid + i;
-                       if (dsl_dataset_block_freeable(ds, bp->blk_birth)) {
+                       if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
                                dprintf_bp(bp, "can free old%s", "");
-                               space += bp_get_dasize(spa, bp);
+                               space += bp_get_dsize(spa, bp);
                        }
                        unref += BP_GET_ASIZE(bp);
                }
@@ -515,14 +523,22 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
                blkoff = P2PHASE(blkid, epb);
                tochk = MIN(epb - blkoff, nblks);
 
-               dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG);
-
-               txh->txh_memory_tohold += dbuf->db.db_size;
-               if (txh->txh_memory_tohold > DMU_MAX_ACCESS) {
-                       txh->txh_tx->tx_err = E2BIG;
-                       dbuf_rele(dbuf, FTAG);
+               err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
+               if (err) {
+                       txh->txh_tx->tx_err = err;
                        break;
                }
+
+               txh->txh_memory_tohold += dbuf->db.db_size;
+
+               /*
+                * We don't check memory_tohold against DMU_MAX_ACCESS because
+                * memory_tohold is an over-estimation (especially the >L1
+                * indirect blocks), so it could fail.  Callers should have
+                * already verified that they will not be holding too much
+                * memory.
+                */
+
                err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
                if (err != 0) {
                        txh->txh_tx->tx_err = err;
@@ -534,9 +550,10 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
                bp += blkoff;
 
                for (i = 0; i < tochk; i++) {
-                       if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) {
+                       if (dsl_dataset_block_freeable(ds, &bp[i],
+                           bp[i].blk_birth)) {
                                dprintf_bp(&bp[i], "can free old%s", "");
-                               space += bp_get_dasize(spa, &bp[i]);
+                               space += bp_get_dsize(spa, &bp[i]);
                        }
                        unref += BP_GET_ASIZE(bp);
                }
@@ -581,6 +598,8 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
        if (len != DMU_OBJECT_END)
                dmu_tx_count_write(txh, off+len, 1);
 
+       dmu_tx_count_dnode(txh);
+
        if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
                return;
        if (len == DMU_OBJECT_END)
@@ -623,7 +642,6 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
                }
        }
 
-       dmu_tx_count_dnode(txh);
        dmu_tx_count_free(txh, off, len);
 }
 
@@ -673,6 +691,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
                 * the size will change between now and the dbuf dirty call.
                 */
                if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+                   &dn->dn_phys->dn_blkptr[0],
                    dn->dn_phys->dn_blkptr[0].blk_birth)) {
                        txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
                } else {
@@ -688,7 +707,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
                 * access the name in this fat-zap so that we'll check
                 * for i/o errors to the leaf blocks, etc.
                 */
-               err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name,
+               err = zap_lookup(dn->dn_objset, dn->dn_object, name,
                    8, 0, NULL);
                if (err == EIO) {
                        tx->tx_err = err;
@@ -696,7 +715,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
                }
        }
 
-       err = zap_count_write(&dn->dn_objset->os, dn->dn_object, name, add,
+       err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
            &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
 
        /*
@@ -771,7 +790,7 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
        dnode_t *dn = db->db_dnode;
 
        ASSERT(tx->tx_txg != 0);
-       ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os);
+       ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
        ASSERT3U(dn->dn_object, ==, db->db.db_object);
 
        if (tx->tx_anyobj)
@@ -808,10 +827,11 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
                                        match_offset = TRUE;
                                /*
                                 * We will let this hold work for the bonus
-                                * buffer so that we don't need to hold it
-                                * when creating a new object.
+                                * or spill buffer so that we don't need to
+                                * hold it when creating a new object.
                                 */
-                               if (blkid == DB_BONUS_BLKID)
+                               if (blkid == DMU_BONUS_BLKID ||
+                                   blkid == DMU_SPILL_BLKID)
                                        match_offset = TRUE;
                                /*
                                 * They might have to increase nlevels,
@@ -832,8 +852,12 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
                                    txh->txh_arg2 == DMU_OBJECT_END))
                                        match_offset = TRUE;
                                break;
+                       case THT_SPILL:
+                               if (blkid == DMU_SPILL_BLKID)
+                                       match_offset = TRUE;
+                               break;
                        case THT_BONUS:
-                               if (blkid == DB_BONUS_BLKID)
+                               if (blkid == DMU_BONUS_BLKID)
                                        match_offset = TRUE;
                                break;
                        case THT_ZAP:
@@ -931,7 +955,7 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
         * assume that we won't be able to free or overwrite anything.
         */
        if (tx->tx_objset &&
-           dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) >
+           dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) >
            tx->tx_lastsnap_txg) {
                towrite += tooverwrite;
                tooverwrite = tofree = 0;
@@ -1112,8 +1136,13 @@ dmu_tx_commit(dmu_tx_t *tx)
        if (tx->tx_tempreserve_cookie)
                dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
 
+       if (!list_is_empty(&tx->tx_callbacks))
+               txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
+
        if (tx->tx_anyobj == FALSE)
                txg_rele_to_sync(&tx->tx_txgh);
+
+       list_destroy(&tx->tx_callbacks);
        list_destroy(&tx->tx_holds);
 #ifdef ZFS_DEBUG
        dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
@@ -1142,6 +1171,14 @@ dmu_tx_abort(dmu_tx_t *tx)
                if (dn != NULL)
                        dnode_rele(dn, tx);
        }
+
+       /*
+        * Call any registered callbacks with an error code.
+        */
+       if (!list_is_empty(&tx->tx_callbacks))
+               dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
+
+       list_destroy(&tx->tx_callbacks);
        list_destroy(&tx->tx_holds);
 #ifdef ZFS_DEBUG
        refcount_destroy_many(&tx->tx_space_written,
@@ -1158,3 +1195,169 @@ dmu_tx_get_txg(dmu_tx_t *tx)
        ASSERT(tx->tx_txg != 0);
        return (tx->tx_txg);
 }
+
+void
+dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
+{
+       dmu_tx_callback_t *dcb;
+
+       dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
+
+       dcb->dcb_func = func;
+       dcb->dcb_data = data;
+
+       list_insert_tail(&tx->tx_callbacks, dcb);
+}
+
+/*
+ * Call all the commit callbacks on a list, with a given error code.
+ */
+void
+dmu_tx_do_callbacks(list_t *cb_list, int error)
+{
+       dmu_tx_callback_t *dcb;
+
+       while (dcb = list_head(cb_list)) {
+               list_remove(cb_list, dcb);
+               dcb->dcb_func(dcb->dcb_data, error);
+               kmem_free(dcb, sizeof (dmu_tx_callback_t));
+       }
+}
+
+/*
+ * Interface to hold a bunch of attributes.
+ * used for creating new files.
+ * attrsize is the total size of all attributes
+ * to be added during object creation
+ *
+ * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
+ */
+
+/*
+ * hold necessary attribute name for attribute registration.
+ * should be a very rare case where this is needed.  If it does
+ * happen it would only happen on the first write to the file system.
+ */
+static void
+dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
+{
+       int i;
+
+       if (!sa->sa_need_attr_registration)
+               return;
+
+       for (i = 0; i != sa->sa_num_attrs; i++) {
+               if (!sa->sa_attr_table[i].sa_registered) {
+                       if (sa->sa_reg_attr_obj)
+                               dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
+                                   B_TRUE, sa->sa_attr_table[i].sa_name);
+                       else
+                               dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
+                                   B_TRUE, sa->sa_attr_table[i].sa_name);
+               }
+       }
+}
+
+
+void
+dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
+{
+       dnode_t *dn;
+       dmu_tx_hold_t *txh;
+       blkptr_t *bp;
+
+       txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
+           THT_SPILL, 0, 0);
+
+       dn = txh->txh_dnode;
+
+       if (dn == NULL)
+               return;
+
+       /* If blkptr doesn't exist then add space to towrite */
+       bp = &dn->dn_phys->dn_spill;
+       if (BP_IS_HOLE(bp)) {
+               txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+               txh->txh_space_tounref = 0;
+       } else {
+               if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+                   bp, bp->blk_birth))
+                       txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+               else
+                       txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+               if (bp->blk_birth)
+                       txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+       }
+}
+
+void
+dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
+{
+       sa_os_t *sa = tx->tx_objset->os_sa;
+
+       dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+
+       if (tx->tx_objset->os_sa->sa_master_obj == 0)
+               return;
+
+       if (tx->tx_objset->os_sa->sa_layout_attr_obj)
+               dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
+       else {
+               dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
+               dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
+               dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+               dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+       }
+
+       dmu_tx_sa_registration_hold(sa, tx);
+
+       if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
+               return;
+
+       (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
+           THT_SPILL, 0, 0);
+}
+
+/*
+ * Hold SA attribute
+ *
+ * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
+ *
+ * variable_size is the total size of all variable sized attributes
+ * passed to this function.  It is not the total size of all
+ * variable size attributes that *may* exist on this object.
+ */
+void
+dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
+{
+       uint64_t object;
+       sa_os_t *sa = tx->tx_objset->os_sa;
+
+       ASSERT(hdl != NULL);
+
+       object = sa_handle_object(hdl);
+
+       dmu_tx_hold_bonus(tx, object);
+
+       if (tx->tx_objset->os_sa->sa_master_obj == 0)
+               return;
+
+       if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
+           tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
+               dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
+               dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
+               dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+               dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+       }
+
+       dmu_tx_sa_registration_hold(sa, tx);
+
+       if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
+               dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
+
+       if (sa->sa_force_spill || may_grow || hdl->sa_spill ||
+           ((dmu_buf_impl_t *)hdl->sa_bonus)->db_dnode->dn_have_spill) {
+               ASSERT(tx->tx_txg == 0);
+               dmu_tx_hold_spill(tx, object);
+       }
+}
index 4d79fe9..37037c3 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/zfs_context.h>
 #include <sys/dnode.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_zfetch.h>
 #include <sys/dmu.h>
 #include <sys/dbuf.h>
+#include <sys/kstat.h>
 
 /*
  * I'm against tune-ables, but these should probably exist as tweakable globals
@@ -59,6 +58,41 @@ static zstream_t     *dmu_zfetch_stream_reclaim(zfetch_t *);
 static void            dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
 static int             dmu_zfetch_streams_equal(zstream_t *, zstream_t *);
 
+typedef struct zfetch_stats {
+       kstat_named_t zfetchstat_hits;
+       kstat_named_t zfetchstat_misses;
+       kstat_named_t zfetchstat_colinear_hits;
+       kstat_named_t zfetchstat_colinear_misses;
+       kstat_named_t zfetchstat_stride_hits;
+       kstat_named_t zfetchstat_stride_misses;
+       kstat_named_t zfetchstat_reclaim_successes;
+       kstat_named_t zfetchstat_reclaim_failures;
+       kstat_named_t zfetchstat_stream_resets;
+       kstat_named_t zfetchstat_stream_noresets;
+       kstat_named_t zfetchstat_bogus_streams;
+} zfetch_stats_t;
+
+static zfetch_stats_t zfetch_stats = {
+       { "hits",                       KSTAT_DATA_UINT64 },
+       { "misses",                     KSTAT_DATA_UINT64 },
+       { "colinear_hits",              KSTAT_DATA_UINT64 },
+       { "colinear_misses",            KSTAT_DATA_UINT64 },
+       { "stride_hits",                KSTAT_DATA_UINT64 },
+       { "stride_misses",              KSTAT_DATA_UINT64 },
+       { "reclaim_successes",          KSTAT_DATA_UINT64 },
+       { "reclaim_failures",           KSTAT_DATA_UINT64 },
+       { "streams_resets",             KSTAT_DATA_UINT64 },
+       { "streams_noresets",           KSTAT_DATA_UINT64 },
+       { "bogus_streams",              KSTAT_DATA_UINT64 },
+};
+
+#define        ZFETCHSTAT_INCR(stat, val) \
+       atomic_add_64(&zfetch_stats.stat.value.ui64, (val));
+
+#define        ZFETCHSTAT_BUMP(stat)           ZFETCHSTAT_INCR(stat, 1);
+
+kstat_t                *zfetch_ksp;
+
 /*
  * Given a zfetch structure and a zstream structure, determine whether the
  * blocks to be read are part of a co-linear pair of existing prefetch
@@ -192,7 +226,30 @@ dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
                        break;
        }
        zs->zst_ph_offset = prefetch_tail;
-       zs->zst_last = lbolt;
+       zs->zst_last = ddi_get_lbolt();
+}
+
+void
+zfetch_init(void)
+{
+
+       zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
+           KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
+           KSTAT_FLAG_VIRTUAL);
+
+       if (zfetch_ksp != NULL) {
+               zfetch_ksp->ks_data = &zfetch_stats;
+               kstat_install(zfetch_ksp);
+       }
+}
+
+void
+zfetch_fini(void)
+{
+       if (zfetch_ksp != NULL) {
+               kstat_delete(zfetch_ksp);
+               zfetch_ksp = NULL;
+       }
 }
 
 /*
@@ -265,7 +322,7 @@ dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
 }
 
 /*
- * given a zfetch and a zsearch structure, see if there is an associated zstream
+ * given a zfetch and a zstream structure, see if there is an associated zstream
  * for this block read.  If so, it starts a prefetch for the stream it
  * located and returns true, otherwise it returns false
  */
@@ -297,6 +354,7 @@ top:
                 */
                if (zs->zst_len == 0) {
                        /* bogus stream */
+                       ZFETCHSTAT_BUMP(zfetchstat_bogus_streams);
                        continue;
                }
 
@@ -306,9 +364,14 @@ top:
                 */
                if (zh->zst_offset >= zs->zst_offset &&
                    zh->zst_offset < zs->zst_offset + zs->zst_len) {
-                       /* already fetched */
-                       rc = 1;
-                       goto out;
+                       if (prefetched) {
+                               /* already fetched */
+                               ZFETCHSTAT_BUMP(zfetchstat_stride_hits);
+                               rc = 1;
+                               goto out;
+                       } else {
+                               ZFETCHSTAT_BUMP(zfetchstat_stride_misses);
+                       }
                }
 
                /*
@@ -413,6 +476,7 @@ top:
                if (reset) {
                        zstream_t *remove = zs;
 
+                       ZFETCHSTAT_BUMP(zfetchstat_stream_resets);
                        rc = 0;
                        mutex_exit(&zs->zst_lock);
                        rw_exit(&zf->zf_rwlock);
@@ -431,6 +495,7 @@ top:
                                }
                        }
                } else {
+                       ZFETCHSTAT_BUMP(zfetchstat_stream_noresets);
                        rc = 1;
                        dmu_zfetch_dofetch(zf, zs);
                        mutex_exit(&zs->zst_lock);
@@ -487,13 +552,12 @@ dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
                zs_next = list_next(&zf->zf_stream, zs_walk);
 
                if (dmu_zfetch_streams_equal(zs_walk, zs)) {
-                   return (0);
+                       return (0);
                }
        }
 
        list_insert_head(&zf->zf_stream, zs);
        zf->zf_stream_cnt++;
-
        return (1);
 }
 
@@ -513,7 +577,7 @@ dmu_zfetch_stream_reclaim(zfetch_t *zf)
        for (zs = list_head(&zf->zf_stream); zs;
            zs = list_next(&zf->zf_stream, zs)) {
 
-               if (((lbolt - zs->zst_last) / hz) > zfetch_min_sec_reap)
+               if (((ddi_get_lbolt() - zs->zst_last)/hz) > zfetch_min_sec_reap)
                        break;
        }
 
@@ -597,8 +661,15 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
            P2ALIGN(offset, blksz)) >> blkshft;
 
        fetched = dmu_zfetch_find(zf, &zst, prefetched);
-       if (!fetched) {
-               fetched = dmu_zfetch_colinear(zf, &zst);
+       if (fetched) {
+               ZFETCHSTAT_BUMP(zfetchstat_hits);
+       } else {
+               ZFETCHSTAT_BUMP(zfetchstat_misses);
+               if (fetched = dmu_zfetch_colinear(zf, &zst)) {
+                       ZFETCHSTAT_BUMP(zfetchstat_colinear_hits);
+               } else {
+                       ZFETCHSTAT_BUMP(zfetchstat_colinear_misses);
+               }
        }
 
        if (!fetched) {
@@ -608,11 +679,14 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
                 * we still couldn't find a stream, drop the lock, and allocate
                 * one if possible.  Otherwise, give up and go home.
                 */
-               if (newstream == NULL) {
+               if (newstream) {
+                       ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes);
+               } else {
                        uint64_t        maxblocks;
                        uint32_t        max_streams;
                        uint32_t        cur_streams;
 
+                       ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures);
                        cur_streams = zf->zf_stream_cnt;
                        maxblocks = zf->zf_dnode->dn_maxblkid;
 
@@ -625,7 +699,6 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
                        if (cur_streams >= max_streams) {
                                return;
                        }
-
                        newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
                }
 
@@ -635,7 +708,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
                newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
                newstream->zst_cap = zst.zst_len;
                newstream->zst_direction = ZFETCH_FORWARD;
-               newstream->zst_last = lbolt;
+               newstream->zst_last = ddi_get_lbolt();
 
                mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);
 
index d82e72a..c16902d 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -210,6 +209,11 @@ dnode_byteswap(dnode_phys_t *dnp)
                ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES);
                dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len);
        }
+
+       /* Swap SPILL block if we have one */
+       if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
+               byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
+
 }
 
 void
@@ -258,6 +262,27 @@ dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
        rw_exit(&dn->dn_struct_rwlock);
 }
 
+void
+dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
+{
+       ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+       dnode_setdirty(dn, tx);
+       rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+       dn->dn_bonustype = newtype;
+       dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
+       rw_exit(&dn->dn_struct_rwlock);
+}
+
+void
+dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
+{
+       ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+       ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+       dnode_setdirty(dn, tx);
+       dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
+       dn->dn_have_spill = B_FALSE;
+}
+
 static void
 dnode_setdblksz(dnode_t *dn, int size)
 {
@@ -272,7 +297,7 @@ dnode_setdblksz(dnode_t *dn, int size)
 }
 
 static dnode_t *
-dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
+dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
     uint64_t object)
 {
        dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
@@ -294,6 +319,8 @@ dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
        dn->dn_bonustype = dnp->dn_bonustype;
        dn->dn_bonuslen = dnp->dn_bonuslen;
        dn->dn_maxblkid = dnp->dn_maxblkid;
+       dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
+       dn->dn_id_flags = 0;
 
        dmu_zfetch_init(&dn->dn_zfetch, dn);
 
@@ -309,7 +336,7 @@ dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
 static void
 dnode_destroy(dnode_t *dn)
 {
-       objset_impl_t *os = dn->dn_objset;
+       objset_t *os = dn->dn_objset;
 
 #ifdef ZFS_DEBUG
        int i;
@@ -321,7 +348,7 @@ dnode_destroy(dnode_t *dn)
        }
        ASSERT(NULL == list_head(&dn->dn_dbufs));
 #endif
-       ASSERT(dn->dn_oldphys == NULL);
+       ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
 
        mutex_enter(&os->os_lock);
        list_remove(&os->os_dnodes, dn);
@@ -368,6 +395,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
        ASSERT(ot != DMU_OT_NONE);
        ASSERT3U(ot, <, DMU_OT_NUMTYPES);
        ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+           (bonustype == DMU_OT_SA && bonuslen == 0) ||
            (bonustype != DMU_OT_NONE && bonuslen != 0));
        ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
        ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
@@ -383,6 +411,8 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
                ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
                ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
                ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
+               ASSERT3U(dn->dn_next_bonustype[i], ==, 0);
+               ASSERT3U(dn->dn_rm_spillblk[i], ==, 0);
                ASSERT3U(dn->dn_next_blksz[i], ==, 0);
                ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
                ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
@@ -393,7 +423,11 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
        dnode_setdblksz(dn, blocksize);
        dn->dn_indblkshift = ibs;
        dn->dn_nlevels = 1;
-       dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+       if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
+               dn->dn_nblkptr = 1;
+       else
+               dn->dn_nblkptr = 1 +
+                   ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
        dn->dn_bonustype = bonustype;
        dn->dn_bonuslen = bonuslen;
        dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
@@ -407,10 +441,12 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
        }
 
        dn->dn_allocated_txg = tx->tx_txg;
+       dn->dn_id_flags = 0;
 
        dnode_setdirty(dn, tx);
        dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
        dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
+       dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
        dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
 }
 
@@ -426,13 +462,16 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
        ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
        ASSERT(tx->tx_txg != 0);
        ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
-           (bonustype != DMU_OT_NONE && bonuslen != 0));
+           (bonustype != DMU_OT_NONE && bonuslen != 0) ||
+           (bonustype == DMU_OT_SA && bonuslen == 0));
        ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
        ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
 
        /* clean up any unreferenced dbufs */
        dnode_evict_dbufs(dn);
 
+       dn->dn_id_flags = 0;
+
        rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
        dnode_setdirty(dn, tx);
        if (dn->dn_datablksz != blocksize) {
@@ -445,9 +484,19 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
        }
        if (dn->dn_bonuslen != bonuslen)
                dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
-       nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+
+       if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
+               nblkptr = 1;
+       else
+               nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+       if (dn->dn_bonustype != bonustype)
+               dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
        if (dn->dn_nblkptr != nblkptr)
                dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
+       if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+               dbuf_rm_spill(dn, tx);
+               dnode_rm_spill(dn, tx);
+       }
        rw_exit(&dn->dn_struct_rwlock);
 
        /* change type */
@@ -488,7 +537,7 @@ dnode_special_close(dnode_t *dn)
 }
 
 dnode_t *
-dnode_special_open(objset_impl_t *os, dnode_phys_t *dnp, uint64_t object)
+dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object)
 {
        dnode_t *dn = dnode_create(os, dnp, NULL, object);
        DNODE_VERIFY(dn);
@@ -535,7 +584,7 @@ dnode_buf_pageout(dmu_buf_t *db, void *arg)
  * succeeds even for free dnodes.
  */
 int
-dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
+dnode_hold_impl(objset_t *os, uint64_t object, int flag,
     void *tag, dnode_t **dnp)
 {
        int epb, idx, err;
@@ -548,9 +597,14 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
 
        /*
         * If you are holding the spa config lock as writer, you shouldn't
-        * be asking the DMU to do *anything*.
+        * be asking the DMU to do *anything* unless it's the root pool
+        * which may require us to read from the root filesystem while
+        * holding some (not all) of the locks as writer.
         */
-       ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0);
+       ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
+           (spa_is_root(os->os_spa) &&
+           spa_config_held(os->os_spa, SCL_STATE, RW_WRITER) &&
+           !spa_config_held(os->os_spa, SCL_ZIO, RW_WRITER)));
 
        if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
                dn = (object == DMU_USERUSED_OBJECT) ?
@@ -627,7 +681,7 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
        if (dn->dn_free_txg ||
            ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
            ((flag & DNODE_MUST_BE_FREE) &&
-           (type != DMU_OT_NONE || dn->dn_oldphys))) {
+           (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
                mutex_exit(&dn->dn_mtx);
                dbuf_rele(db, FTAG);
                return (type == DMU_OT_NONE ? ENOENT : EEXIST);
@@ -650,7 +704,7 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
  * Return held dnode if the object is allocated, NULL if not.
  */
 int
-dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp)
+dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
 {
        return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
 }
@@ -689,7 +743,7 @@ dnode_rele(dnode_t *dn, void *tag)
 void
 dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
 {
-       objset_impl_t *os = dn->dn_objset;
+       objset_t *os = dn->dn_objset;
        uint64_t txg = tx->tx_txg;
 
        if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
@@ -706,6 +760,11 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
        mutex_exit(&dn->dn_mtx);
 #endif
 
+       /*
+        * Determine old uid/gid when necessary
+        */
+       dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
+
        mutex_enter(&os->os_lock);
 
        /*
@@ -720,6 +779,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
        ASSERT(dn->dn_datablksz != 0);
        ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0);
        ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0);
+       ASSERT3U(dn->dn_next_bonustype[txg&TXG_MASK], ==, 0);
 
        dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
            dn->dn_object, txg);
@@ -814,7 +874,8 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
        for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
                db_next = list_next(&dn->dn_dbufs, db);
 
-               if (db->db_blkid != 0 && db->db_blkid != DB_BONUS_BLKID) {
+               if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
+                   db->db_blkid != DMU_SPILL_BLKID) {
                        mutex_exit(&dn->dn_dbufs_mtx);
                        goto fail;
                }
@@ -858,7 +919,7 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
        int epbs, new_nlevels;
        uint64_t sz;
 
-       ASSERT(blkid != DB_BONUS_BLKID);
+       ASSERT(blkid != DMU_BONUS_BLKID);
 
        ASSERT(have_read ?
            RW_READ_HELD(&dn->dn_struct_rwlock) :
@@ -905,6 +966,7 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
 
                /* dirty the left indirects */
                db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
+               ASSERT(db != NULL);
                new = dbuf_dirty(db, tx);
                dbuf_rele(db, FTAG);
 
@@ -915,7 +977,8 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
                for (dr = list_head(list); dr; dr = dr_next) {
                        dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
                        if (dr->dr_dbuf->db_level != new_nlevels-1 &&
-                           dr->dr_dbuf->db_blkid != DB_BONUS_BLKID) {
+                           dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+                           dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
                                ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
                                list_remove(&dn->dn_dirty_records[txgoff], dr);
                                list_insert_tail(&new->dt.di.dr_children, dr);
@@ -1170,6 +1233,20 @@ out:
        rw_exit(&dn->dn_struct_rwlock);
 }
 
+static boolean_t
+dnode_spill_freed(dnode_t *dn)
+{
+       int i;
+
+       mutex_enter(&dn->dn_mtx);
+       for (i = 0; i < TXG_SIZE; i++) {
+               if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
+                       break;
+       }
+       mutex_exit(&dn->dn_mtx);
+       return (i < TXG_SIZE);
+}
+
 /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
 uint64_t
 dnode_block_freed(dnode_t *dn, uint64_t blkid)
@@ -1178,7 +1255,7 @@ dnode_block_freed(dnode_t *dn, uint64_t blkid)
        void *dp = spa_get_dsl(dn->dn_objset->os_spa);
        int i;
 
-       if (blkid == DB_BONUS_BLKID)
+       if (blkid == DMU_BONUS_BLKID)
                return (FALSE);
 
        /*
@@ -1191,6 +1268,9 @@ dnode_block_freed(dnode_t *dn, uint64_t blkid)
        if (dn->dn_free_txg)
                return (TRUE);
 
+       if (blkid == DMU_SPILL_BLKID)
+               return (dnode_spill_freed(dn));
+
        range_tofind.fr_blkid = blkid;
        mutex_enter(&dn->dn_mtx);
        for (i = 0; i < TXG_SIZE; i++) {
@@ -1248,7 +1328,7 @@ dnode_diduse_space(dnode_t *dn, int64_t delta)
 void
 dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
 {
-       objset_impl_t *os = dn->dn_objset;
+       objset_t *os = dn->dn_objset;
        dsl_dataset_t *ds = os->os_dsl_dataset;
 
        if (space > 0)
index 184fe29..f9ec9f6 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -120,7 +119,7 @@ free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
                if (BP_IS_HOLE(bp))
                        continue;
 
-               bytesfreed += dsl_dataset_block_kill(ds, bp, dn->dn_zio, tx);
+               bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
                ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
                bzero(bp, sizeof (blkptr_t));
                blocks_freed += 1;
@@ -228,7 +227,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
        if (db->db_state != DB_CACHED)
                (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
 
-       arc_release(db->db_buf, db);
+       dbuf_release_bp(db);
        bp = (blkptr_t *)db->db.db_data;
 
        epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
@@ -424,6 +423,9 @@ dnode_undirty_dbufs(list_t *list)
                dmu_buf_impl_t *db = dr->dr_dbuf;
                uint64_t txg = dr->dr_txg;
 
+               if (db->db_level != 0)
+                       dnode_undirty_dbufs(&dr->dt.di.dr_children);
+
                mutex_enter(&db->db_mtx);
                /* XXX - use dbuf_undirty()? */
                list_remove(list, dr);
@@ -431,16 +433,12 @@ dnode_undirty_dbufs(list_t *list)
                db->db_last_dirty = NULL;
                db->db_dirtycnt -= 1;
                if (db->db_level == 0) {
-                       ASSERT(db->db_blkid == DB_BONUS_BLKID ||
+                       ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
                            dr->dt.dl.dr_data == db->db_buf);
                        dbuf_unoverride(dr);
-                       mutex_exit(&db->db_mtx);
-               } else {
-                       mutex_exit(&db->db_mtx);
-                       dnode_undirty_dbufs(&dr->dt.di.dr_children);
                }
                kmem_free(dr, sizeof (dbuf_dirty_record_t));
-               dbuf_rele(db, (void *)(uintptr_t)txg);
+               dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
        }
 }
 
@@ -491,6 +489,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
        dn->dn_maxblkid = 0;
        dn->dn_allocated_txg = 0;
        dn->dn_free_txg = 0;
+       dn->dn_have_spill = B_FALSE;
        mutex_exit(&dn->dn_mtx);
 
        ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
@@ -513,6 +512,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
        int txgoff = tx->tx_txg & TXG_MASK;
        list_t *list = &dn->dn_dirty_records[txgoff];
        static const dnode_phys_t zerodn = { 0 };
+       boolean_t kill_spill = B_FALSE;
 
        ASSERT(dmu_tx_is_syncing(tx));
        ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
@@ -524,10 +524,12 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
 
        if (dmu_objset_userused_enabled(dn->dn_objset) &&
            !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
-               ASSERT(dn->dn_oldphys == NULL);
-               dn->dn_oldphys = zio_buf_alloc(sizeof (dnode_phys_t));
-               *dn->dn_oldphys = *dn->dn_phys; /* struct assignment */
+               mutex_enter(&dn->dn_mtx);
+               dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
+               dn->dn_oldflags = dn->dn_phys->dn_flags;
                dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
+               mutex_exit(&dn->dn_mtx);
+               dmu_objset_userquota_get_ids(dn, B_FALSE, tx);
        } else {
                /* Once we account for it, we should always account for it. */
                ASSERT(!(dn->dn_phys->dn_flags &
@@ -558,6 +560,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
                    SPA_MINBLOCKSIZE) == 0);
                ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
                    dn->dn_maxblkid == 0 || list_head(list) != NULL ||
+                   avl_last(&dn->dn_ranges[txgoff]) ||
                    dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
                    dnp->dn_datablkszsec);
                dnp->dn_datablkszsec =
@@ -574,6 +577,24 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
                dn->dn_next_bonuslen[txgoff] = 0;
        }
 
+       if (dn->dn_next_bonustype[txgoff]) {
+               ASSERT(dn->dn_next_bonustype[txgoff] < DMU_OT_NUMTYPES);
+               dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
+               dn->dn_next_bonustype[txgoff] = 0;
+       }
+
+       /*
+        * We will either remove a spill block when a file is being removed
+        * or we have been asked to remove it.
+        */
+       if (dn->dn_rm_spillblk[txgoff] ||
+           ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
+           dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg)) {
+               if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
+                       kill_spill = B_TRUE;
+               dn->dn_rm_spillblk[txgoff] = 0;
+       }
+
        if (dn->dn_next_indblkshift[txgoff]) {
                ASSERT(dnp->dn_nlevels == 1);
                dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
@@ -590,6 +611,13 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
 
        mutex_exit(&dn->dn_mtx);
 
+       if (kill_spill) {
+               (void) free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
+               mutex_enter(&dn->dn_mtx);
+               dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
+               mutex_exit(&dn->dn_mtx);
+       }
+
        /* process all the "freed" ranges in the file */
        while (rp = avl_last(&dn->dn_ranges[txgoff])) {
                dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx);
index edc36e7..ddd8357 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/dmu_objset.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/spa.h>
 #include <sys/zfs_znode.h>
-#include <sys/sunddi.h>
 #include <sys/zvol.h>
+#include <sys/dsl_scan.h>
+#include <sys/dsl_deadlist.h>
+
+/*
+ * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
+ */
+int zfs_dedup_prefetch = 1;
 
 static char *dsl_reaper = "the grim reaper";
 
 static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
 static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
-static dsl_checkfunc_t dsl_dataset_rollback_check;
-static dsl_syncfunc_t dsl_dataset_rollback_sync;
 static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
 
+#define        SWITCH64(x, y) \
+       { \
+               uint64_t __tmp = (x); \
+               (x) = (y); \
+               (y) = __tmp; \
+       }
+
 #define        DS_REF_MAX      (1ULL << 62)
 
 #define        DSL_DEADLIST_BLOCKSIZE  SPA_MAXBLOCKSIZE
@@ -77,14 +87,14 @@ parent_delta(dsl_dataset_t *ds, int64_t delta)
 }
 
 void
-dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 {
-       int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
+       int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
        int compressed = BP_GET_PSIZE(bp);
        int uncompressed = BP_GET_UCSIZE(bp);
        int64_t delta;
 
-       dprintf_bp(bp, "born, ds=%p\n", ds);
+       dprintf_bp(bp, "ds=%p", ds);
 
        ASSERT(dmu_tx_is_syncing(tx));
        /* It could have been compressed away to nothing */
@@ -104,6 +114,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
                return;
        }
        dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
        mutex_enter(&ds->ds_dir->dd_lock);
        mutex_enter(&ds->ds_lock);
        delta = parent_delta(ds, used);
@@ -120,29 +131,26 @@ dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
 }
 
 int
-dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
-    dmu_tx_t *tx)
+dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
+    boolean_t async)
 {
-       int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
-       int compressed = BP_GET_PSIZE(bp);
-       int uncompressed = BP_GET_UCSIZE(bp);
-
-       ASSERT(pio != NULL);
-       ASSERT(dmu_tx_is_syncing(tx));
-       /* No block pointer => nothing to free */
        if (BP_IS_HOLE(bp))
                return (0);
 
+       ASSERT(dmu_tx_is_syncing(tx));
+       ASSERT(bp->blk_birth <= tx->tx_txg);
+
+       int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
+       int compressed = BP_GET_PSIZE(bp);
+       int uncompressed = BP_GET_UCSIZE(bp);
+
        ASSERT(used > 0);
        if (ds == NULL) {
-               int err;
                /*
                 * Account for the meta-objset space in its placeholder
                 * dataset.
                 */
-               err = dsl_free(pio, tx->tx_pool,
-                   tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT);
-               ASSERT(err == 0);
+               dsl_free(tx->tx_pool, tx->tx_txg, bp);
 
                dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
                    -used, -compressed, -uncompressed, tx);
@@ -155,13 +163,10 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
        dmu_buf_will_dirty(ds->ds_dbuf, tx);
 
        if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
-               int err;
                int64_t delta;
 
-               dprintf_bp(bp, "freeing: %s", "");
-               err = dsl_free(pio, tx->tx_pool,
-                   tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT);
-               ASSERT(err == 0);
+               dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
+               dsl_free(tx->tx_pool, tx->tx_txg, bp);
 
                mutex_enter(&ds->ds_dir->dd_lock);
                mutex_enter(&ds->ds_lock);
@@ -177,7 +182,18 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
                mutex_exit(&ds->ds_dir->dd_lock);
        } else {
                dprintf_bp(bp, "putting on dead list: %s", "");
-               VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
+               if (async) {
+                       /*
+                        * We are here as part of zio's write done callback,
+                        * which means we're a zio interrupt thread.  We can't
+                        * call dsl_deadlist_insert() now because it may block
+                        * waiting for I/O.  Instead, put bp on the deferred
+                        * queue and let dsl_pool_sync() finish the job.
+                        */
+                       bplist_append(&ds->ds_pending_deadlist, bp);
+               } else {
+                       dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
+               }
                ASSERT3U(ds->ds_prev->ds_object, ==,
                    ds->ds_phys->ds_prev_snap_obj);
                ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
@@ -190,7 +206,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
                        ds->ds_prev->ds_phys->ds_unique_bytes += used;
                        mutex_exit(&ds->ds_prev->ds_lock);
                }
-               if (bp->blk_birth > ds->ds_origin_txg) {
+               if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
                        dsl_dir_transfer_space(ds->ds_dir, used,
                            DD_USED_HEAD, DD_USED_SNAP, tx);
                }
@@ -231,9 +247,16 @@ dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
 }
 
 boolean_t
-dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth)
+dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
+    uint64_t blk_birth)
 {
-       return (blk_birth > dsl_dataset_prev_snap_txg(ds));
+       if (blk_birth <= dsl_dataset_prev_snap_txg(ds))
+               return (B_FALSE);
+
+       if (zfs_dedup_prefetch && bp && BP_GET_DEDUP(bp))
+               ddt_prefetch(dsl_dataset_get_spa(ds), bp);
+
+       return (B_TRUE);
 }
 
 /* ARGSUSED */
@@ -244,19 +267,23 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv)
 
        ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));
 
-       dprintf_ds(ds, "evicting %s\n", "");
-
        unique_remove(ds->ds_fsid_guid);
 
-       if (ds->ds_user_ptr != NULL)
-               ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+       if (ds->ds_objset != NULL)
+               dmu_objset_evict(ds->ds_objset);
 
        if (ds->ds_prev) {
                dsl_dataset_drop_ref(ds->ds_prev, ds);
                ds->ds_prev = NULL;
        }
 
-       bplist_close(&ds->ds_deadlist);
+       bplist_destroy(&ds->ds_pending_deadlist);
+       if (db != NULL) {
+               dsl_deadlist_close(&ds->ds_deadlist);
+       } else {
+               ASSERT(ds->ds_deadlist.dl_dbuf == NULL);
+               ASSERT(!ds->ds_deadlist.dl_oldfmt);
+       }
        if (ds->ds_dir)
                dsl_dir_close(ds->ds_dir, ds);
 
@@ -265,7 +292,6 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv)
        mutex_destroy(&ds->ds_lock);
        mutex_destroy(&ds->ds_recvlock);
        mutex_destroy(&ds->ds_opening_lock);
-       mutex_destroy(&ds->ds_deadlist.bpl_lock);
        rw_destroy(&ds->ds_rwlock);
        cv_destroy(&ds->ds_exclusive_cv);
 
@@ -325,6 +351,8 @@ dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
        matchtype_t mt;
        int err;
 
+       dsl_dir_snap_cmtime_update(ds->ds_dir);
+
        if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
                mt = MT_FIRST;
        else
@@ -363,28 +391,25 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
                mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
                mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
                mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
-               mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT,
-                   NULL);
                rw_init(&ds->ds_rwlock, 0, 0, 0);
                cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);
 
-               err = bplist_open(&ds->ds_deadlist,
+               bplist_create(&ds->ds_pending_deadlist);
+               dsl_deadlist_open(&ds->ds_deadlist,
                    mos, ds->ds_phys->ds_deadlist_obj);
+
                if (err == 0) {
                        err = dsl_dir_open_obj(dp,
                            ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
                }
                if (err) {
-                       /*
-                        * we don't really need to close the blist if we
-                        * just opened it.
-                        */
                        mutex_destroy(&ds->ds_lock);
                        mutex_destroy(&ds->ds_recvlock);
                        mutex_destroy(&ds->ds_opening_lock);
-                       mutex_destroy(&ds->ds_deadlist.bpl_lock);
                        rw_destroy(&ds->ds_rwlock);
                        cv_destroy(&ds->ds_exclusive_cv);
+                       bplist_destroy(&ds->ds_pending_deadlist);
+                       dsl_deadlist_close(&ds->ds_deadlist);
                        kmem_free(ds, sizeof (dsl_dataset_t));
                        dmu_buf_rele(dbuf, tag);
                        return (err);
@@ -397,19 +422,6 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
                                    ds->ds_phys->ds_prev_snap_obj,
                                    ds, &ds->ds_prev);
                        }
-
-                       if (err == 0 && dsl_dir_is_clone(ds->ds_dir)) {
-                               dsl_dataset_t *origin;
-
-                               err = dsl_dataset_hold_obj(dp,
-                                   ds->ds_dir->dd_phys->dd_origin_obj,
-                                   FTAG, &origin);
-                               if (err == 0) {
-                                       ds->ds_origin_txg =
-                                           origin->ds_phys->ds_creation_txg;
-                                       dsl_dataset_rele(origin, FTAG);
-                               }
-                       }
                } else {
                        if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
                                err = dsl_dataset_get_snapname(ds);
@@ -454,14 +466,14 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
                            dsl_dataset_evict);
                }
                if (err || winner) {
-                       bplist_close(&ds->ds_deadlist);
+                       bplist_destroy(&ds->ds_pending_deadlist);
+                       dsl_deadlist_close(&ds->ds_deadlist);
                        if (ds->ds_prev)
                                dsl_dataset_drop_ref(ds->ds_prev, ds);
                        dsl_dir_close(ds->ds_dir, ds);
                        mutex_destroy(&ds->ds_lock);
                        mutex_destroy(&ds->ds_recvlock);
                        mutex_destroy(&ds->ds_opening_lock);
-                       mutex_destroy(&ds->ds_deadlist.bpl_lock);
                        rw_destroy(&ds->ds_rwlock);
                        cv_destroy(&ds->ds_exclusive_cv);
                        kmem_free(ds, sizeof (dsl_dataset_t));
@@ -531,7 +543,15 @@ dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
                        rw_enter(&dp->dp_config_rwlock, RW_READER);
                        return (ENOENT);
                }
+               /*
+                * The dp_config_rwlock lives above the ds_lock. And
+                * we need to check DSL_DATASET_IS_DESTROYED() while
+                * holding the ds_lock, so we have to drop and reacquire
+                * the ds_lock here.
+                */
+               mutex_exit(&ds->ds_lock);
                rw_enter(&dp->dp_config_rwlock, RW_READER);
+               mutex_enter(&ds->ds_lock);
        }
        mutex_exit(&ds->ds_lock);
        return (0);
@@ -549,17 +569,14 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
 }
 
 int
-dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, int flags, void *owner,
-    dsl_dataset_t **dsp)
+dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok,
+    void *tag, dsl_dataset_t **dsp)
 {
-       int err = dsl_dataset_hold_obj(dp, dsobj, owner, dsp);
-
-       ASSERT(DS_MODE_TYPE(flags) != DS_MODE_USER);
-
+       int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
        if (err)
                return (err);
-       if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) {
-               dsl_dataset_rele(*dsp, owner);
+       if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
+               dsl_dataset_rele(*dsp, tag);
                *dsp = NULL;
                return (EBUSY);
        }
@@ -626,18 +643,14 @@ out:
 }
 
 int
-dsl_dataset_own(const char *name, int flags, void *owner, dsl_dataset_t **dsp)
+dsl_dataset_own(const char *name, boolean_t inconsistentok,
+    void *tag, dsl_dataset_t **dsp)
 {
-       int err = dsl_dataset_hold(name, owner, dsp);
+       int err = dsl_dataset_hold(name, tag, dsp);
        if (err)
                return (err);
-       if ((*dsp)->ds_phys->ds_num_children > 0 &&
-           !DS_MODE_IS_READONLY(flags)) {
-               dsl_dataset_rele(*dsp, owner);
-               return (EROFS);
-       }
-       if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) {
-               dsl_dataset_rele(*dsp, owner);
+       if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
+               dsl_dataset_rele(*dsp, tag);
                return (EBUSY);
        }
        return (0);
@@ -709,9 +722,9 @@ dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
 }
 
 void
-dsl_dataset_disown(dsl_dataset_t *ds, void *owner)
+dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
 {
-       ASSERT((ds->ds_owner == owner && ds->ds_dbuf) ||
+       ASSERT((ds->ds_owner == tag && ds->ds_dbuf) ||
            (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));
 
        mutex_enter(&ds->ds_lock);
@@ -722,20 +735,20 @@ dsl_dataset_disown(dsl_dataset_t *ds, void *owner)
        }
        mutex_exit(&ds->ds_lock);
        if (ds->ds_dbuf)
-               dsl_dataset_drop_ref(ds, owner);
+               dsl_dataset_drop_ref(ds, tag);
        else
-               dsl_dataset_evict(ds->ds_dbuf, ds);
+               dsl_dataset_evict(NULL, ds);
 }
 
 boolean_t
-dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *owner)
+dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag)
 {
        boolean_t gotit = FALSE;
 
        mutex_enter(&ds->ds_lock);
        if (ds->ds_owner == NULL &&
            (!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
-               ds->ds_owner = owner;
+               ds->ds_owner = tag;
                if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
                        rw_exit(&ds->ds_rwlock);
                gotit = TRUE;
@@ -786,10 +799,12 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
            DMU_OT_NONE, 0, tx);
        dsphys->ds_creation_time = gethrestime_sec();
        dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
-       dsphys->ds_deadlist_obj =
-           bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
 
-       if (origin) {
+       if (origin == NULL) {
+               dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
+       } else {
+               dsl_dataset_t *ohds;
+
                dsphys->ds_prev_snap_obj = origin->ds_object;
                dsphys->ds_prev_snap_txg =
                    origin->ds_phys->ds_creation_txg;
@@ -805,6 +820,12 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
                dmu_buf_will_dirty(origin->ds_dbuf, tx);
                origin->ds_phys->ds_num_children++;
 
+               VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+                   origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
+               dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
+                   dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
+               dsl_dataset_rele(ohds, FTAG);
+
                if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
                        if (origin->ds_phys->ds_next_clones_obj == 0) {
                                origin->ds_phys->ds_next_clones_obj =
@@ -818,6 +839,16 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
 
                dmu_buf_will_dirty(dd->dd_dbuf, tx);
                dd->dd_phys->dd_origin_obj = origin->ds_object;
+               if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+                       if (origin->ds_dir->dd_phys->dd_clones == 0) {
+                               dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
+                               origin->ds_dir->dd_phys->dd_clones =
+                                   zap_create(mos,
+                                   DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+                       }
+                       VERIFY3U(0, ==, zap_add_int(mos,
+                           origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
+               }
        }
 
        if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
@@ -861,29 +892,20 @@ struct destroyarg {
 };
 
 static int
-dsl_snapshot_destroy_one(char *name, void *arg)
+dsl_snapshot_destroy_one(const char *name, void *arg)
 {
        struct destroyarg *da = arg;
        dsl_dataset_t *ds;
        int err;
        char *dsname;
-       size_t buflen;
-
-       /* alloc a buffer to hold name@snapname, plus the terminating NULL */
-       buflen = strlen(name) + strlen(da->snapname) + 2;
-       dsname = kmem_alloc(buflen, KM_SLEEP);
-       (void) snprintf(dsname, buflen, "%s@%s", name, da->snapname);
-       err = dsl_dataset_own(dsname, DS_MODE_READONLY | DS_MODE_INCONSISTENT,
-           da->dstg, &ds);
-       kmem_free(dsname, buflen);
+
+       dsname = kmem_asprintf("%s@%s", name, da->snapname);
+       err = dsl_dataset_own(dsname, B_TRUE, da->dstg, &ds);
+       strfree(dsname);
        if (err == 0) {
                struct dsl_ds_destroyarg *dsda;
 
                dsl_dataset_make_exclusive(ds, da->dstg);
-               if (ds->ds_user_ptr) {
-                       ds->ds_user_evict_func(ds, ds->ds_user_ptr);
-                       ds->ds_user_ptr = NULL;
-               }
                dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP);
                dsda->ds = ds;
                dsda->defer = da->defer;
@@ -959,25 +981,6 @@ dsl_dataset_might_destroy_origin(dsl_dataset_t *ds)
        return (might_destroy);
 }
 
-#ifdef _KERNEL
-static int
-dsl_dataset_zvol_cleanup(dsl_dataset_t *ds, const char *name)
-{
-       int error;
-       objset_t *os;
-
-       error = dmu_objset_open_ds(ds, DMU_OST_ANY, &os);
-       if (error)
-               return (error);
-
-       if (dmu_objset_type(os) == DMU_OST_ZVOL)
-               error = zvol_remove_minor(name);
-       dmu_objset_close(os);
-
-       return (error);
-}
-#endif
-
 /*
  * If we're removing a clone, and these three conditions are true:
  *     1) the clone's origin has no other children
@@ -1005,15 +1008,8 @@ dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
                        kmem_free(name, namelen);
                        return (error);
                }
-               error = dsl_dataset_zvol_cleanup(origin, name);
-               if (error) {
-                       kmem_free(name, namelen);
-                       return (error);
-               }
 #endif
-               error = dsl_dataset_own(name,
-                   DS_MODE_READONLY | DS_MODE_INCONSISTENT,
-                   tag, &origin);
+               error = dsl_dataset_own(name, B_TRUE, tag, &origin);
                kmem_free(name, namelen);
                if (error)
                        return (error);
@@ -1036,7 +1032,8 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
        objset_t *os;
        dsl_dir_t *dd;
        uint64_t obj;
-       struct dsl_ds_destroyarg dsda = {0};
+       struct dsl_ds_destroyarg dsda = { 0 };
+       dsl_dataset_t dummy_ds = { 0 };
 
        dsda.ds = ds;
 
@@ -1044,20 +1041,20 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
                /* Destroying a snapshot is simpler */
                dsl_dataset_make_exclusive(ds, tag);
 
-               if (ds->ds_user_ptr) {
-                       ds->ds_user_evict_func(ds, ds->ds_user_ptr);
-                       ds->ds_user_ptr = NULL;
-               }
-               /* NOTE: defer is always B_FALSE for non-snapshots */
                dsda.defer = defer;
                err = dsl_sync_task_do(ds->ds_dir->dd_pool,
                    dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
                    &dsda, tag, 0);
                ASSERT3P(dsda.rm_origin, ==, NULL);
                goto out;
+       } else if (defer) {
+               err = EINVAL;
+               goto out;
        }
 
        dd = ds->ds_dir;
+       dummy_ds.ds_dir = dd;
+       dummy_ds.ds_object = ds->ds_object;
 
        /*
         * Check for errors and mark this ds as inconsistent, in
@@ -1068,7 +1065,7 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
        if (err)
                goto out;
 
-       err = dmu_objset_open_ds(ds, DMU_OST_ANY, &os);
+       err = dmu_objset_from_ds(ds, &os);
        if (err)
                goto out;
 
@@ -1097,7 +1094,7 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
         * context, the user space accounting should be zero.
         */
        if (ds->ds_phys->ds_bp.blk_fill == 0 &&
-           dmu_objset_userused_enabled(os->os)) {
+           dmu_objset_userused_enabled(os)) {
                uint64_t count;
 
                ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 ||
@@ -1106,7 +1103,6 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
                    count == 0);
        }
 
-       dmu_objset_close(os);
        if (err != ESRCH)
                goto out;
 
@@ -1117,24 +1113,10 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
        if (err)
                goto out;
 
-       if (ds->ds_user_ptr) {
-               /*
-                * We need to sync out all in-flight IO before we try
-                * to evict (the dataset evict func is trying to clear
-                * the cached entries for this dataset in the ARC).
-                */
-               txg_wait_synced(dd->dd_pool, 0);
-       }
-
        /*
         * Blow away the dsl_dir + head dataset.
         */
        dsl_dataset_make_exclusive(ds, tag);
-       if (ds->ds_user_ptr) {
-               ds->ds_user_evict_func(ds, ds->ds_user_ptr);
-               ds->ds_user_ptr = NULL;
-       }
-
        /*
         * If we're removing a clone, we might also need to remove its
         * origin.
@@ -1153,7 +1135,7 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
                dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
                    dsl_dataset_destroy_sync, &dsda, tag, 0);
                dsl_sync_task_create(dstg, dsl_dir_destroy_check,
-                   dsl_dir_destroy_sync, dd, FTAG, 0);
+                   dsl_dir_destroy_sync, &dummy_ds, FTAG, 0);
                err = dsl_sync_task_group_wait(dstg);
                dsl_sync_task_group_destroy(dstg);
 
@@ -1181,47 +1163,6 @@ out:
        return (err);
 }
 
-int
-dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost)
-{
-       int err;
-
-       ASSERT(ds->ds_owner);
-
-       dsl_dataset_make_exclusive(ds, ds->ds_owner);
-       err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-           dsl_dataset_rollback_check, dsl_dataset_rollback_sync,
-           ds, &ost, 0);
-       /* drop exclusive access */
-       mutex_enter(&ds->ds_lock);
-       rw_exit(&ds->ds_rwlock);
-       cv_broadcast(&ds->ds_exclusive_cv);
-       mutex_exit(&ds->ds_lock);
-       return (err);
-}
-
-void *
-dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
-    void *p, dsl_dataset_evict_func_t func)
-{
-       void *old;
-
-       mutex_enter(&ds->ds_lock);
-       old = ds->ds_user_ptr;
-       if (old == NULL) {
-               ds->ds_user_ptr = p;
-               ds->ds_user_evict_func = func;
-       }
-       mutex_exit(&ds->ds_lock);
-       return (old);
-}
-
-void *
-dsl_dataset_get_user_ptr(dsl_dataset_t *ds)
-{
-       return (ds->ds_user_ptr);
-}
-
 blkptr_t *
 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
 {
@@ -1255,7 +1196,7 @@ dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
        if (ds == NULL) /* this is the meta-objset */
                return;
 
-       ASSERT(ds->ds_user_ptr != NULL);
+       ASSERT(ds->ds_objset != NULL);
 
        if (ds->ds_phys->ds_next_snap_obj != 0)
                panic("dirtying snapshot!");
@@ -1282,62 +1223,51 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
        uint64_t mrs_used;
        uint64_t dlused, dlcomp, dluncomp;
 
-       ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj);
+       ASSERT(!dsl_dataset_is_snapshot(ds));
 
        if (ds->ds_phys->ds_prev_snap_obj != 0)
                mrs_used = ds->ds_prev->ds_phys->ds_used_bytes;
        else
                mrs_used = 0;
 
-       VERIFY(0 == bplist_space(&ds->ds_deadlist, &dlused, &dlcomp,
-           &dluncomp));
+       dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
 
        ASSERT3U(dlused, <=, mrs_used);
        ds->ds_phys->ds_unique_bytes =
            ds->ds_phys->ds_used_bytes - (mrs_used - dlused);
 
-       if (!DS_UNIQUE_IS_ACCURATE(ds) &&
-           spa_version(ds->ds_dir->dd_pool->dp_spa) >=
+       if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
            SPA_VERSION_UNIQUE_ACCURATE)
                ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 }
 
-static uint64_t
-dsl_dataset_unique(dsl_dataset_t *ds)
-{
-       if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds))
-               dsl_dataset_recalc_head_uniq(ds);
-
-       return (ds->ds_phys->ds_unique_bytes);
-}
-
 struct killarg {
        dsl_dataset_t *ds;
-       zio_t *zio;
        dmu_tx_t *tx;
 };
 
 /* ARGSUSED */
 static int
-kill_blkptr(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
-    const dnode_phys_t *dnp, void *arg)
+kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
+    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
 {
        struct killarg *ka = arg;
+       dmu_tx_t *tx = ka->tx;
 
        if (bp == NULL)
                return (0);
 
-       if ((zb->zb_level == -1ULL && zb->zb_blkid != 0) ||
-           (zb->zb_object != 0 && dnp == NULL)) {
+       if (zb->zb_level == ZB_ZIL_LEVEL) {
+               ASSERT(zilog != NULL);
                /*
                 * It's a block in the intent log.  It has no
                 * accounting, so just free it.
                 */
-               VERIFY3U(0, ==, dsl_free(ka->zio, ka->tx->tx_pool,
-                   ka->tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT));
+               dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
        } else {
+               ASSERT(zilog == NULL);
                ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
-               (void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx);
+               (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
        }
 
        return (0);
@@ -1345,145 +1275,6 @@ kill_blkptr(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
 
 /* ARGSUSED */
 static int
-dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-       dsl_dataset_t *ds = arg1;
-       dmu_objset_type_t *ost = arg2;
-
-       /*
-        * We can only roll back to emptyness if it is a ZPL objset.
-        */
-       if (*ost != DMU_OST_ZFS &&
-           ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL)
-               return (EINVAL);
-
-       /*
-        * This must not be a snapshot.
-        */
-       if (ds->ds_phys->ds_next_snap_obj != 0)
-               return (EINVAL);
-
-       /*
-        * If we made changes this txg, traverse_dataset won't find
-        * them.  Try again.
-        */
-       if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
-               return (EAGAIN);
-
-       return (0);
-}
-
-/* ARGSUSED */
-static void
-dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
-       dsl_dataset_t *ds = arg1;
-       dmu_objset_type_t *ost = arg2;
-       objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-
-       dmu_buf_will_dirty(ds->ds_dbuf, tx);
-
-       if (ds->ds_user_ptr != NULL) {
-               /*
-                * We need to make sure that the objset_impl_t is reopened after
-                * we do the rollback, otherwise it will have the wrong
-                * objset_phys_t.  Normally this would happen when this
-                * dataset-open is closed, thus causing the
-                * dataset to be immediately evicted.  But when doing "zfs recv
-                * -F", we reopen the objset before that, so that there is no
-                * window where the dataset is closed and inconsistent.
-                */
-               ds->ds_user_evict_func(ds, ds->ds_user_ptr);
-               ds->ds_user_ptr = NULL;
-       }
-
-       /* Transfer space that was freed since last snap back to the head. */
-       {
-               uint64_t used;
-
-               VERIFY(0 == bplist_space_birthrange(&ds->ds_deadlist,
-                   ds->ds_origin_txg, UINT64_MAX, &used));
-               dsl_dir_transfer_space(ds->ds_dir, used,
-                   DD_USED_SNAP, DD_USED_HEAD, tx);
-       }
-
-       /* Zero out the deadlist. */
-       bplist_close(&ds->ds_deadlist);
-       bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
-       ds->ds_phys->ds_deadlist_obj =
-           bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
-       VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
-           ds->ds_phys->ds_deadlist_obj));
-
-       {
-               /*
-                * Free blkptrs that we gave birth to - this covers
-                * claimed but not played log blocks too.
-                */
-               zio_t *zio;
-               struct killarg ka;
-
-               zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
-                   ZIO_FLAG_MUSTSUCCEED);
-               ka.ds = ds;
-               ka.zio = zio;
-               ka.tx = tx;
-               (void) traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
-                   TRAVERSE_POST, kill_blkptr, &ka);
-               (void) zio_wait(zio);
-       }
-
-       ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
-
-       if (ds->ds_prev && ds->ds_prev != ds->ds_dir->dd_pool->dp_origin_snap) {
-               /* Change our contents to that of the prev snapshot */
-
-               ASSERT3U(ds->ds_prev->ds_object, ==,
-                   ds->ds_phys->ds_prev_snap_obj);
-               ASSERT3U(ds->ds_phys->ds_used_bytes, <=,
-                   ds->ds_prev->ds_phys->ds_used_bytes);
-
-               ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
-               ds->ds_phys->ds_used_bytes =
-                   ds->ds_prev->ds_phys->ds_used_bytes;
-               ds->ds_phys->ds_compressed_bytes =
-                   ds->ds_prev->ds_phys->ds_compressed_bytes;
-               ds->ds_phys->ds_uncompressed_bytes =
-                   ds->ds_prev->ds_phys->ds_uncompressed_bytes;
-               ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags;
-
-               if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
-                       dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
-                       ds->ds_prev->ds_phys->ds_unique_bytes = 0;
-               }
-       } else {
-               objset_impl_t *osi;
-
-               ASSERT(*ost != DMU_OST_ZVOL);
-               ASSERT3U(ds->ds_phys->ds_used_bytes, ==, 0);
-               ASSERT3U(ds->ds_phys->ds_compressed_bytes, ==, 0);
-               ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, ==, 0);
-
-               bzero(&ds->ds_phys->ds_bp, sizeof (blkptr_t));
-               ds->ds_phys->ds_flags = 0;
-               ds->ds_phys->ds_unique_bytes = 0;
-               if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
-                   SPA_VERSION_UNIQUE_ACCURATE)
-                       ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
-
-               osi = dmu_objset_create_impl(ds->ds_dir->dd_pool->dp_spa, ds,
-                   &ds->ds_phys->ds_bp, *ost, tx);
-#ifdef _KERNEL
-               zfs_create_fs(&osi->os, kcred, NULL, tx);
-#endif
-       }
-
-       spa_history_internal_log(LOG_DS_ROLLBACK, ds->ds_dir->dd_pool->dp_spa,
-           tx, cr, "dataset = %llu", ds->ds_object);
-}
-
-/* ARGSUSED */
-static int
 dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dataset_t *ds = arg1;
@@ -1498,7 +1289,7 @@ dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
         */
        if (ds->ds_prev != NULL &&
            ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
-               return (EINVAL);
+               return (EBUSY);
 
        /*
         * This is really a dsl_dir thing, but check it here so that
@@ -1516,7 +1307,7 @@ dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
 
 /* ARGSUSED */
 static void
-dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dataset_t *ds = arg1;
        dsl_pool_t *dp = ds->ds_dir->dd_pool;
@@ -1525,8 +1316,8 @@ dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
        dmu_buf_will_dirty(ds->ds_dbuf, tx);
        ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
 
-       spa_history_internal_log(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx,
-           cr, "dataset = %llu", ds->ds_object);
+       spa_history_log_internal(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx,
+           "dataset = %llu", ds->ds_object);
 }
 
 static int
@@ -1594,7 +1385,7 @@ dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
         */
        if (ds->ds_prev != NULL &&
            ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
-               return (EINVAL);
+               return (EBUSY);
 
        /*
         * If we made changes this txg, traverse_dsl_dataset won't find
@@ -1672,12 +1463,135 @@ dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
        cv_destroy(&arg.cv);
 }
 
+static void
+remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
+{
+       objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+       uint64_t count;
+       int err;
+
+       ASSERT(ds->ds_phys->ds_num_children >= 2);
+       err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
+       /*
+        * The err should not be ENOENT, but a bug in a previous version
+        * of the code could cause upgrade_clones_cb() to not set
+        * ds_next_snap_obj when it should, leading to a missing entry.
+        * If we knew that the pool was created after
+        * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
+        * ENOENT.  However, at least we can check that we don't have
+        * too many entries in the next_clones_obj even after failing to
+        * remove this one.
+        */
+       if (err != ENOENT) {
+               VERIFY3U(err, ==, 0);
+       }
+       ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
+           &count));
+       ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
+}
+
+static void
+dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
+{
+       objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+       zap_cursor_t zc;
+       zap_attribute_t za;
+
+       /*
+        * If it is the old version, dd_clones doesn't exist so we can't
+        * find the clones, but deadlist_remove_key() is a no-op so it
+        * doesn't matter.
+        */
+       if (ds->ds_dir->dd_phys->dd_clones == 0)
+               return;
+
+       for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
+           zap_cursor_retrieve(&zc, &za) == 0;
+           zap_cursor_advance(&zc)) {
+               dsl_dataset_t *clone;
+
+               VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
+                   za.za_first_integer, FTAG, &clone));
+               if (clone->ds_dir->dd_origin_txg > mintxg) {
+                       dsl_deadlist_remove_key(&clone->ds_deadlist,
+                           mintxg, tx);
+                       dsl_dataset_remove_clones_key(clone, mintxg, tx);
+               }
+               dsl_dataset_rele(clone, FTAG);
+       }
+       zap_cursor_fini(&zc);
+}
+
+struct process_old_arg {
+       dsl_dataset_t *ds;
+       dsl_dataset_t *ds_prev;
+       boolean_t after_branch_point;
+       zio_t *pio;
+       uint64_t used, comp, uncomp;
+};
+
+static int
+process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+       struct process_old_arg *poa = arg;
+       dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
+
+       if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
+               dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
+               if (poa->ds_prev && !poa->after_branch_point &&
+                   bp->blk_birth >
+                   poa->ds_prev->ds_phys->ds_prev_snap_txg) {
+                       poa->ds_prev->ds_phys->ds_unique_bytes +=
+                           bp_get_dsize_sync(dp->dp_spa, bp);
+               }
+       } else {
+               poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
+               poa->comp += BP_GET_PSIZE(bp);
+               poa->uncomp += BP_GET_UCSIZE(bp);
+               dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
+       }
+       return (0);
+}
+
+static void
+process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
+    dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
+{
+       struct process_old_arg poa = { 0 };
+       dsl_pool_t *dp = ds->ds_dir->dd_pool;
+       objset_t *mos = dp->dp_meta_objset;
+
+       ASSERT(ds->ds_deadlist.dl_oldfmt);
+       ASSERT(ds_next->ds_deadlist.dl_oldfmt);
+
+       poa.ds = ds;
+       poa.ds_prev = ds_prev;
+       poa.after_branch_point = after_branch_point;
+       poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+       VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
+           process_old_cb, &poa, tx));
+       VERIFY3U(zio_wait(poa.pio), ==, 0);
+       ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);
+
+       /* change snapused */
+       dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+           -poa.used, -poa.comp, -poa.uncomp, tx);
+
+       /* swap next's deadlist to our deadlist */
+       dsl_deadlist_close(&ds->ds_deadlist);
+       dsl_deadlist_close(&ds_next->ds_deadlist);
+       SWITCH64(ds_next->ds_phys->ds_deadlist_obj,
+           ds->ds_phys->ds_deadlist_obj);
+       dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+       dsl_deadlist_open(&ds_next->ds_deadlist, mos,
+           ds_next->ds_phys->ds_deadlist_obj);
+}
+
 void
-dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
 {
        struct dsl_ds_destroyarg *dsda = arg1;
        dsl_dataset_t *ds = dsda->ds;
-       zio_t *zio;
        int err;
        int after_branch_point = FALSE;
        dsl_pool_t *dp = ds->ds_dir->dd_pool;
@@ -1706,16 +1620,28 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
        cv_broadcast(&ds->ds_exclusive_cv);
        mutex_exit(&ds->ds_lock);
 
+       if (ds->ds_objset) {
+               dmu_objset_evict(ds->ds_objset);
+               ds->ds_objset = NULL;
+       }
+
        /* Remove our reservation */
        if (ds->ds_reserved != 0) {
-               uint64_t val = 0;
-               dsl_dataset_set_reservation_sync(ds, &val, cr, tx);
+               dsl_prop_setarg_t psa;
+               uint64_t value = 0;
+
+               dsl_prop_setarg_init_uint64(&psa, "refreservation",
+                   (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
+                   &value);
+               psa.psa_effective_value = 0;    /* predict default value */
+
+               dsl_dataset_set_reservation_sync(ds, &psa, tx);
                ASSERT3U(ds->ds_reserved, ==, 0);
        }
 
        ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
 
-       dsl_pool_ds_destroyed(ds, tx);
+       dsl_scan_ds_destroyed(ds, tx);
 
        obj = ds->ds_object;
 
@@ -1732,8 +1658,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
                dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
                if (after_branch_point &&
                    ds_prev->ds_phys->ds_next_clones_obj != 0) {
-                       VERIFY3U(0, ==, zap_remove_int(mos,
-                           ds_prev->ds_phys->ds_next_clones_obj, obj, tx));
+                       remove_from_next_clones(ds_prev, obj, tx);
                        if (ds->ds_phys->ds_next_snap_obj != 0) {
                                VERIFY(0 == zap_add_int(mos,
                                    ds_prev->ds_phys->ds_next_clones_obj,
@@ -1765,20 +1690,16 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
                }
        }
 
-       zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-
-       if (ds->ds_phys->ds_next_snap_obj != 0) {
-               blkptr_t bp;
+       if (dsl_dataset_is_snapshot(ds)) {
                dsl_dataset_t *ds_next;
-               uint64_t itor = 0;
                uint64_t old_unique;
-               int64_t used = 0, compressed = 0, uncompressed = 0;
+               uint64_t used = 0, comp = 0, uncomp = 0;
 
                VERIFY(0 == dsl_dataset_hold_obj(dp,
                    ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
                ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
 
-               old_unique = dsl_dataset_unique(ds_next);
+               old_unique = ds_next->ds_phys->ds_unique_bytes;
 
                dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
                ds_next->ds_phys->ds_prev_snap_obj =
@@ -1788,53 +1709,49 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
                ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
                    ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
 
-               /*
-                * Transfer to our deadlist (which will become next's
-                * new deadlist) any entries from next's current
-                * deadlist which were born before prev, and free the
-                * other entries.
-                *
-                * XXX we're doing this long task with the config lock held
-                */
-               while (bplist_iterate(&ds_next->ds_deadlist, &itor, &bp) == 0) {
-                       if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
-                               VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
-                                   &bp, tx));
-                               if (ds_prev && !after_branch_point &&
-                                   bp.blk_birth >
-                                   ds_prev->ds_phys->ds_prev_snap_txg) {
-                                       ds_prev->ds_phys->ds_unique_bytes +=
-                                           bp_get_dasize(dp->dp_spa, &bp);
-                               }
-                       } else {
-                               used += bp_get_dasize(dp->dp_spa, &bp);
-                               compressed += BP_GET_PSIZE(&bp);
-                               uncompressed += BP_GET_UCSIZE(&bp);
-                               /* XXX check return value? */
-                               (void) dsl_free(zio, dp, tx->tx_txg,
-                                   &bp, NULL, NULL, ARC_NOWAIT);
-                       }
-               }
 
-               ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
+               if (ds_next->ds_deadlist.dl_oldfmt) {
+                       process_old_deadlist(ds, ds_prev, ds_next,
+                           after_branch_point, tx);
+               } else {
+                       /* Adjust prev's unique space. */
+                       if (ds_prev && !after_branch_point) {
+                               dsl_deadlist_space_range(&ds_next->ds_deadlist,
+                                   ds_prev->ds_phys->ds_prev_snap_txg,
+                                   ds->ds_phys->ds_prev_snap_txg,
+                                   &used, &comp, &uncomp);
+                               ds_prev->ds_phys->ds_unique_bytes += used;
+                       }
 
-               /* change snapused */
-               dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
-                   -used, -compressed, -uncompressed, tx);
+                       /* Adjust snapused. */
+                       dsl_deadlist_space_range(&ds_next->ds_deadlist,
+                           ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+                           &used, &comp, &uncomp);
+                       dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+                           -used, -comp, -uncomp, tx);
+
+                       /* Move blocks to be freed to pool's free list. */
+                       dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
+                           &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
+                           tx);
+                       dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
+                           DD_USED_HEAD, used, comp, uncomp, tx);
+                       dsl_dir_dirty(tx->tx_pool->dp_free_dir, tx);
+
+                       /* Merge our deadlist into next's and free it. */
+                       dsl_deadlist_merge(&ds_next->ds_deadlist,
+                           ds->ds_phys->ds_deadlist_obj, tx);
+               }
+               dsl_deadlist_close(&ds->ds_deadlist);
+               dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
 
-               /* free next's deadlist */
-               bplist_close(&ds_next->ds_deadlist);
-               bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
+               /* Collapse range in clone heads */
+               dsl_dataset_remove_clones_key(ds,
+                   ds->ds_phys->ds_creation_txg, tx);
 
-               /* set next's deadlist to our deadlist */
-               bplist_close(&ds->ds_deadlist);
-               ds_next->ds_phys->ds_deadlist_obj =
-                   ds->ds_phys->ds_deadlist_obj;
-               VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
-                   ds_next->ds_phys->ds_deadlist_obj));
-               ds->ds_phys->ds_deadlist_obj = 0;
+               if (dsl_dataset_is_snapshot(ds_next)) {
+                       dsl_dataset_t *ds_nextnext;
 
-               if (ds_next->ds_phys->ds_next_snap_obj != 0) {
                        /*
                         * Update next's unique to include blocks which
                         * were previously shared by only this snapshot
@@ -1843,25 +1760,27 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
                         * died after the next snap and before the one
                         * after that (ie. be on the snap after next's
                         * deadlist).
-                        *
-                        * XXX we're doing this long task with the
-                        * config lock held
                         */
-                       dsl_dataset_t *ds_after_next;
-                       uint64_t space;
-
                        VERIFY(0 == dsl_dataset_hold_obj(dp,
                            ds_next->ds_phys->ds_next_snap_obj,
-                           FTAG, &ds_after_next));
-
-                       VERIFY(0 ==
-                           bplist_space_birthrange(&ds_after_next->ds_deadlist,
+                           FTAG, &ds_nextnext));
+                       dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
                            ds->ds_phys->ds_prev_snap_txg,
-                           ds->ds_phys->ds_creation_txg, &space));
-                       ds_next->ds_phys->ds_unique_bytes += space;
-
-                       dsl_dataset_rele(ds_after_next, FTAG);
+                           ds->ds_phys->ds_creation_txg,
+                           &used, &comp, &uncomp);
+                       ds_next->ds_phys->ds_unique_bytes += used;
+                       dsl_dataset_rele(ds_nextnext, FTAG);
                        ASSERT3P(ds_next->ds_prev, ==, NULL);
+
+                       /* Collapse range in this head. */
+                       dsl_dataset_t *hds;
+                       VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+                           ds->ds_dir->dd_phys->dd_head_dataset_obj,
+                           FTAG, &hds));
+                       dsl_deadlist_remove_key(&hds->ds_deadlist,
+                           ds->ds_phys->ds_creation_txg, tx);
+                       dsl_dataset_rele(hds, FTAG);
+
                } else {
                        ASSERT3P(ds_next->ds_prev, ==, ds);
                        dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
@@ -1901,9 +1820,8 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
                 */
                struct killarg ka;
 
-               ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
-               bplist_close(&ds->ds_deadlist);
-               bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
+               dsl_deadlist_close(&ds->ds_deadlist);
+               dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
                ds->ds_phys->ds_deadlist_obj = 0;
 
                /*
@@ -1914,17 +1832,23 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
                 * freed all the objects in open context.
                 */
                ka.ds = ds;
-               ka.zio = zio;
                ka.tx = tx;
                err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
                    TRAVERSE_POST, kill_blkptr, &ka);
                ASSERT3U(err, ==, 0);
                ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
                    ds->ds_phys->ds_unique_bytes == 0);
-       }
 
-       err = zio_wait(zio);
-       ASSERT3U(err, ==, 0);
+               if (ds->ds_prev != NULL) {
+                       if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+                               VERIFY3U(0, ==, zap_remove_int(mos,
+                                   ds->ds_prev->ds_dir->dd_phys->dd_clones,
+                                   ds->ds_object, tx));
+                       }
+                       dsl_dataset_rele(ds->ds_prev, ds);
+                       ds->ds_prev = ds_prev = NULL;
+               }
+       }
 
        if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
                /* Erase the link in the dir */
@@ -1959,8 +1883,8 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
                dsl_dataset_rele(ds_prev, FTAG);
 
        spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
-       spa_history_internal_log(LOG_DS_DESTROY, dp->dp_spa, tx,
-           cr, "dataset = %llu", ds->ds_object);
+       spa_history_log_internal(LOG_DS_DESTROY, dp->dp_spa, tx,
+           "dataset = %llu", ds->ds_object);
 
        if (ds->ds_phys->ds_next_clones_obj != 0) {
                uint64_t count;
@@ -1982,20 +1906,10 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
                /*
                 * Remove the origin of the clone we just destroyed.
                 */
-               dsl_dataset_t *origin = ds->ds_prev;
                struct dsl_ds_destroyarg ndsda = {0};
 
-               ASSERT3P(origin, ==, dsda->rm_origin);
-               if (origin->ds_user_ptr) {
-                       origin->ds_user_evict_func(origin, origin->ds_user_ptr);
-                       origin->ds_user_ptr = NULL;
-               }
-
-               dsl_dataset_rele(origin, tag);
-               ds->ds_prev = NULL;
-
-               ndsda.ds = origin;
-               dsl_dataset_destroy_sync(&ndsda, tag, cr, tx);
+               ndsda.ds = dsda->rm_origin;
+               dsl_dataset_destroy_sync(&ndsda, tag, tx);
        }
 }
 
@@ -2012,7 +1926,8 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
         * owned by the snapshot dataset must be accommodated by space
         * outside of the reservation.
         */
-       asize = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
+       ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
+       asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
        if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE))
                return (ENOSPC);
 
@@ -2026,7 +1941,6 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
        return (0);
 }
 
-/* ARGSUSED */
 int
 dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
@@ -2067,7 +1981,7 @@ dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
 }
 
 void
-dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dataset_t *ds = arg1;
        const char *snapname = arg2;
@@ -2125,8 +2039,8 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
                            ds->ds_prev->ds_phys->ds_creation_txg);
                        ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
                } else if (next_clones_obj != 0) {
-                       VERIFY3U(0, ==, zap_remove_int(mos,
-                           next_clones_obj, dsphys->ds_next_snap_obj, tx));
+                       remove_from_next_clones(ds->ds_prev,
+                           dsphys->ds_next_snap_obj, tx);
                        VERIFY3U(0, ==, zap_add_int(mos,
                            next_clones_obj, dsobj, tx));
                }
@@ -2138,25 +2052,31 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
         * since our unique space is going to zero.
         */
        if (ds->ds_reserved) {
-               int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
+               int64_t delta;
+               ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
+               delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
                dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
-                   add, 0, 0, tx);
+                   delta, 0, 0, tx);
        }
 
-       bplist_close(&ds->ds_deadlist);
        dmu_buf_will_dirty(ds->ds_dbuf, tx);
+       zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu",
+           ds->ds_dir->dd_myname, snapname, dsobj,
+           ds->ds_phys->ds_prev_snap_txg);
+       ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
+           UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
+       dsl_deadlist_close(&ds->ds_deadlist);
+       dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+       dsl_deadlist_add_key(&ds->ds_deadlist,
+           ds->ds_phys->ds_prev_snap_txg, tx);
+
        ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
        ds->ds_phys->ds_prev_snap_obj = dsobj;
        ds->ds_phys->ds_prev_snap_txg = crtxg;
        ds->ds_phys->ds_unique_bytes = 0;
        if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
                ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
-       ds->ds_phys->ds_deadlist_obj =
-           bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
-       VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
-           ds->ds_phys->ds_deadlist_obj));
 
-       dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
        err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
            snapname, 8, 1, &dsobj, tx);
        ASSERT(err == 0);
@@ -2166,9 +2086,11 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
        VERIFY(0 == dsl_dataset_get_ref(dp,
            ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
 
-       dsl_pool_ds_snapshotted(ds, tx);
+       dsl_scan_ds_snapshotted(ds, tx);
+
+       dsl_dir_snap_cmtime_update(ds->ds_dir);
 
-       spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr,
+       spa_history_log_internal(LOG_DS_SNAPSHOT, dp->dp_spa, tx,
            "dataset = %llu", dsobj);
 }
 
@@ -2176,7 +2098,7 @@ void
 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
 {
        ASSERT(dmu_tx_is_syncing(tx));
-       ASSERT(ds->ds_user_ptr != NULL);
+       ASSERT(ds->ds_objset != NULL);
        ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
 
        /*
@@ -2187,7 +2109,7 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
        ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
 
        dsl_dir_dirty(ds->ds_dir, tx);
-       dmu_objset_sync(ds->ds_user_ptr, zio, tx);
+       dmu_objset_sync(ds->ds_objset, zio, tx);
 }
 
 void
@@ -2211,7 +2133,12 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
            ds->ds_reserved);
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
            ds->ds_phys->ds_guid);
-       dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, ds->ds_userrefs);
+       dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
+           ds->ds_phys->ds_unique_bytes);
+       dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
+           ds->ds_object);
+       dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
+           ds->ds_userrefs);
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
            DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
 
@@ -2335,8 +2262,7 @@ dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
 }
 
 static void
-dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2,
-    cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dataset_t *ds = arg1;
        const char *newsnapname = arg2;
@@ -2360,8 +2286,8 @@ dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2,
            ds->ds_snapname, 8, 1, &ds->ds_object, tx);
        ASSERT3U(err, ==, 0);
 
-       spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx,
-           cr, "dataset = %llu", ds->ds_object);
+       spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx,
+           "dataset = %llu", ds->ds_object);
        dsl_dataset_rele(hds, FTAG);
 }
 
@@ -2373,43 +2299,36 @@ struct renamesnaparg {
 };
 
 static int
-dsl_snapshot_rename_one(char *name, void *arg)
+dsl_snapshot_rename_one(const char *name, void *arg)
 {
        struct renamesnaparg *ra = arg;
        dsl_dataset_t *ds = NULL;
-       char *cp;
+       char *snapname;
        int err;
 
-       cp = name + strlen(name);
-       *cp = '@';
-       (void) strcpy(cp + 1, ra->oldsnap);
+       snapname = kmem_asprintf("%s@%s", name, ra->oldsnap);
+       (void) strlcpy(ra->failed, snapname, sizeof (ra->failed));
 
        /*
         * For recursive snapshot renames the parent won't be changing
         * so we just pass name for both the to/from argument.
         */
-       err = zfs_secpolicy_rename_perms(name, name, CRED());
-       if (err == ENOENT) {
-               return (0);
-       } else if (err) {
-               (void) strcpy(ra->failed, name);
-               return (err);
+       err = zfs_secpolicy_rename_perms(snapname, snapname, CRED());
+       if (err != 0) {
+               strfree(snapname);
+               return (err == ENOENT ? 0 : err);
        }
 
 #ifdef _KERNEL
        /*
         * For all filesystems undergoing rename, we'll need to unmount it.
         */
-       (void) zfs_unmount_snap(name, NULL);
+       (void) zfs_unmount_snap(snapname, NULL);
 #endif
-       err = dsl_dataset_hold(name, ra->dstg, &ds);
-       *cp = '\0';
-       if (err == ENOENT) {
-               return (0);
-       } else if (err) {
-               (void) strcpy(ra->failed, name);
-               return (err);
-       }
+       err = dsl_dataset_hold(snapname, ra->dstg, &ds);
+       strfree(snapname);
+       if (err != 0)
+               return (err == ENOENT ? 0 : err);
 
        dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
            dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);
@@ -2425,7 +2344,7 @@ dsl_recursive_rename(char *oldname, const char *newname)
        dsl_sync_task_t *dst;
        spa_t *spa;
        char *cp, *fsname = spa_strdup(oldname);
-       int len = strlen(oldname);
+       int len = strlen(oldname) + 1;
 
        /* truncate the snapshot name to get the fsname */
        cp = strchr(fsname, '@');
@@ -2433,7 +2352,7 @@ dsl_recursive_rename(char *oldname, const char *newname)
 
        err = spa_open(fsname, &spa, FTAG);
        if (err) {
-               kmem_free(fsname, len + 1);
+               kmem_free(fsname, len);
                return (err);
        }
        ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP);
@@ -2445,7 +2364,7 @@ dsl_recursive_rename(char *oldname, const char *newname)
 
        err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
            DS_FIND_CHILDREN);
-       kmem_free(fsname, len + 1);
+       kmem_free(fsname, len);
 
        if (err == 0) {
                err = dsl_sync_task_group_wait(ra->dstg);
@@ -2456,14 +2375,15 @@ dsl_recursive_rename(char *oldname, const char *newname)
                dsl_dataset_t *ds = dst->dst_arg1;
                if (dst->dst_err) {
                        dsl_dir_name(ds->ds_dir, ra->failed);
-                       (void) strcat(ra->failed, "@");
-                       (void) strcat(ra->failed, ra->newsnap);
+                       (void) strlcat(ra->failed, "@", sizeof (ra->failed));
+                       (void) strlcat(ra->failed, ra->newsnap,
+                           sizeof (ra->failed));
                }
                dsl_dataset_rele(ds, ra->dstg);
        }
 
        if (err)
-               (void) strcpy(oldname, ra->failed);
+               (void) strlcpy(oldname, ra->failed, sizeof (ra->failed));
 
        dsl_sync_task_group_destroy(ra->dstg);
        kmem_free(ra, sizeof (struct renamesnaparg));
@@ -2472,7 +2392,7 @@ dsl_recursive_rename(char *oldname, const char *newname)
 }
 
 static int
-dsl_valid_rename(char *oldname, void *arg)
+dsl_valid_rename(const char *oldname, void *arg)
 {
        int delta = *(int *)arg;
 
@@ -2494,12 +2414,7 @@ dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive)
        err = dsl_dir_open(oldname, FTAG, &dd, &tail);
        if (err)
                return (err);
-       /*
-        * If there are more than 2 references there may be holds
-        * hanging around that haven't been cleared out yet.
-        */
-       if (dmu_buf_refcount(dd->dd_dbuf) > 2)
-               txg_wait_synced(dd->dd_pool, 0);
+
        if (tail == NULL) {
                int delta = strlen(newname) - strlen(oldname);
 
@@ -2508,13 +2423,14 @@ dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive)
                        err = dmu_objset_find(oldname, dsl_valid_rename,
                            &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
 
-               if (!err)
+               if (err == 0)
                        err = dsl_dir_rename(dd, newname);
                dsl_dir_close(dd, FTAG);
                return (err);
        }
+
        if (tail[0] != '@') {
-               /* the name ended in a nonexistant component */
+               /* the name ended in a nonexistent component */
                dsl_dir_close(dd, FTAG);
                return (ENOENT);
        }
@@ -2553,13 +2469,14 @@ struct promotenode {
 
 struct promotearg {
        list_t shared_snaps, origin_snaps, clone_snaps;
-       dsl_dataset_t *origin_origin, *origin_head;
+       dsl_dataset_t *origin_origin;
        uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
+       char *err_ds;
 };
 
 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
+static boolean_t snaplist_unstable(list_t *l);
 
-/* ARGSUSED */
 static int
 dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
@@ -2568,6 +2485,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
        struct promotenode *snap = list_head(&pa->shared_snaps);
        dsl_dataset_t *origin_ds = snap->ds;
        int err;
+       uint64_t unused;
 
        /* Check that it is a real clone */
        if (!dsl_dir_is_clone(hds->ds_dir))
@@ -2583,10 +2501,9 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
        /* compute origin's new unique space */
        snap = list_tail(&pa->clone_snaps);
        ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
-       err = bplist_space_birthrange(&snap->ds->ds_deadlist,
-           origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, &pa->unique);
-       if (err)
-               return (err);
+       dsl_deadlist_space_range(&snap->ds->ds_deadlist,
+           origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+           &pa->unique, &unused, &unused);
 
        /*
         * Walk the snapshots that we are moving
@@ -2614,18 +2531,19 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
                /* Check that the snapshot name does not conflict */
                VERIFY(0 == dsl_dataset_get_snapname(ds));
                err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
-               if (err == 0)
-                       return (EEXIST);
+               if (err == 0) {
+                       err = EEXIST;
+                       goto out;
+               }
                if (err != ENOENT)
-                       return (err);
+                       goto out;
 
                /* The very first snapshot does not have a deadlist */
                if (ds->ds_phys->ds_prev_snap_obj == 0)
                        continue;
 
-               if (err = bplist_space(&ds->ds_deadlist,
-                   &dlused, &dlcomp, &dluncomp))
-                       return (err);
+               dsl_deadlist_space(&ds->ds_deadlist,
+                   &dlused, &dlcomp, &dluncomp);
                pa->used += dlused;
                pa->comp += dlcomp;
                pa->uncomp += dluncomp;
@@ -2658,19 +2576,19 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
 
                /*
                 * Note, typically this will not be a clone of a clone,
-                * so snap->ds->ds_origin_txg will be < TXG_INITIAL, so
-                * these snaplist_space() -> bplist_space_birthrange()
+                * so dd_origin_txg will be < TXG_INITIAL, so
+                * these snaplist_space() -> dsl_deadlist_space_range()
                 * calls will be fast because they do not have to
                 * iterate over all bps.
                 */
                snap = list_head(&pa->origin_snaps);
                err = snaplist_space(&pa->shared_snaps,
-                   snap->ds->ds_origin_txg, &pa->cloneusedsnap);
+                   snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap);
                if (err)
                        return (err);
 
                err = snaplist_space(&pa->clone_snaps,
-                   snap->ds->ds_origin_txg, &space);
+                   snap->ds->ds_dir->dd_origin_txg, &space);
                if (err)
                        return (err);
                pa->cloneusedsnap += space;
@@ -2683,10 +2601,13 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
        }
 
        return (0);
+out:
+       pa->err_ds = snap->ds->ds_snapname;
+       return (err);
 }
 
 static void
-dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dataset_t *hds = arg1;
        struct promotearg *pa = arg2;
@@ -2720,9 +2641,7 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 
        /* change the origin's next clone */
        if (origin_ds->ds_phys->ds_next_clones_obj) {
-               VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
-                   origin_ds->ds_phys->ds_next_clones_obj,
-                   origin_ds->ds_phys->ds_next_snap_obj, tx));
+               remove_from_next_clones(origin_ds, snap->ds->ds_object, tx);
                VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
                    origin_ds->ds_phys->ds_next_clones_obj,
                    oldnext_obj, tx));
@@ -2732,10 +2651,31 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
        dmu_buf_will_dirty(dd->dd_dbuf, tx);
        ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
        dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
-       hds->ds_origin_txg = origin_head->ds_origin_txg;
+       dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
        dmu_buf_will_dirty(odd->dd_dbuf, tx);
        odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
-       origin_head->ds_origin_txg = origin_ds->ds_phys->ds_creation_txg;
+       origin_head->ds_dir->dd_origin_txg =
+           origin_ds->ds_phys->ds_creation_txg;
+
+       /* change dd_clone entries */
+       if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+               VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+                   odd->dd_phys->dd_clones, hds->ds_object, tx));
+               VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
+                   pa->origin_origin->ds_dir->dd_phys->dd_clones,
+                   hds->ds_object, tx));
+
+               VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+                   pa->origin_origin->ds_dir->dd_phys->dd_clones,
+                   origin_head->ds_object, tx));
+               if (dd->dd_phys->dd_clones == 0) {
+                       dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset,
+                           DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+               }
+               VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
+                   dd->dd_phys->dd_clones, origin_head->ds_object, tx));
+
+       }
 
        /* move snapshots to this dir */
        for (snap = list_head(&pa->shared_snaps); snap;
@@ -2743,9 +2683,9 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
                dsl_dataset_t *ds = snap->ds;
 
                /* unregister props as dsl_dir is changing */
-               if (ds->ds_user_ptr) {
-                       ds->ds_user_evict_func(ds, ds->ds_user_ptr);
-                       ds->ds_user_ptr = NULL;
+               if (ds->ds_objset) {
+                       dmu_objset_evict(ds->ds_objset);
+                       ds->ds_objset = NULL;
                }
                /* move snap name entry */
                VERIFY(0 == dsl_dataset_get_snapname(ds));
@@ -2754,6 +2694,7 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
                VERIFY(0 == zap_add(dp->dp_meta_objset,
                    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
                    8, 1, &ds->ds_object, tx));
+
                /* change containing dsl_dir */
                dmu_buf_will_dirty(ds->ds_dbuf, tx);
                ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
@@ -2763,6 +2704,40 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
                VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
                    NULL, ds, &ds->ds_dir));
 
+               /* move any clone references */
+               if (ds->ds_phys->ds_next_clones_obj &&
+                   spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+                       zap_cursor_t zc;
+                       zap_attribute_t za;
+
+                       for (zap_cursor_init(&zc, dp->dp_meta_objset,
+                           ds->ds_phys->ds_next_clones_obj);
+                           zap_cursor_retrieve(&zc, &za) == 0;
+                           zap_cursor_advance(&zc)) {
+                               dsl_dataset_t *cnds;
+                               uint64_t o;
+
+                               if (za.za_first_integer == oldnext_obj) {
+                                       /*
+                                        * We've already moved the
+                                        * origin's reference.
+                                        */
+                                       continue;
+                               }
+
+                               VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+                                   za.za_first_integer, FTAG, &cnds));
+                               o = cnds->ds_dir->dd_phys->dd_head_dataset_obj;
+
+                               VERIFY3U(zap_remove_int(dp->dp_meta_objset,
+                                   odd->dd_phys->dd_clones, o, tx), ==, 0);
+                               VERIFY3U(zap_add_int(dp->dp_meta_objset,
+                                   dd->dd_phys->dd_clones, o, tx), ==, 0);
+                               dsl_dataset_rele(cnds, FTAG);
+                       }
+                       zap_cursor_fini(&zc);
+               }
+
                ASSERT3U(dsl_prop_numcb(ds), ==, 0);
        }
 
@@ -2792,8 +2767,8 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
        origin_ds->ds_phys->ds_unique_bytes = pa->unique;
 
        /* log history record */
-       spa_history_internal_log(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx,
-           cr, "dataset = %llu", hds->ds_object);
+       spa_history_log_internal(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx,
+           "dataset = %llu", hds->ds_object);
 
        dsl_dir_close(odd, FTAG);
 }
@@ -2858,11 +2833,9 @@ snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
 
        *spacep = 0;
        for (snap = list_head(l); snap; snap = list_next(l, snap)) {
-               uint64_t used;
-               int err = bplist_space_birthrange(&snap->ds->ds_deadlist,
-                   mintxg, UINT64_MAX, &used);
-               if (err)
-                       return (err);
+               uint64_t used, comp, uncomp;
+               dsl_deadlist_space_range(&snap->ds->ds_deadlist,
+                   mintxg, UINT64_MAX, &used, &comp, &uncomp);
                *spacep += used;
        }
        return (0);
@@ -2897,7 +2870,7 @@ snaplist_destroy(list_t *l, boolean_t own)
  * NULL, indicating that the clone is not a clone of a clone).
  */
 int
-dsl_dataset_promote(const char *name)
+dsl_dataset_promote(const char *name, char *conflsnap)
 {
        dsl_dataset_t *ds;
        dsl_dir_t *dd;
@@ -2949,10 +2922,10 @@ dsl_dataset_promote(const char *name)
        if (err != 0)
                goto out;
 
-       if (dsl_dir_is_clone(snap->ds->ds_dir)) {
-               err = dsl_dataset_own_obj(dp,
+       if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
+               err = dsl_dataset_hold_obj(dp,
                    snap->ds->ds_dir->dd_phys->dd_origin_obj,
-                   0, FTAG, &pa.origin_origin);
+                   FTAG, &pa.origin_origin);
                if (err != 0)
                        goto out;
        }
@@ -2968,14 +2941,16 @@ out:
        if (err == 0) {
                err = dsl_sync_task_do(dp, dsl_dataset_promote_check,
                    dsl_dataset_promote_sync, ds, &pa,
-                   2 + 2 * doi.doi_physical_blks);
+                   2 + 2 * doi.doi_physical_blocks_512);
+               if (err && pa.err_ds && conflsnap)
+                       (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN);
        }
 
        snaplist_destroy(&pa.shared_snaps, B_TRUE);
        snaplist_destroy(&pa.clone_snaps, B_FALSE);
        snaplist_destroy(&pa.origin_snaps, B_FALSE);
        if (pa.origin_origin)
-               dsl_dataset_disown(pa.origin_origin, FTAG);
+               dsl_dataset_rele(pa.origin_origin, FTAG);
        dsl_dataset_rele(ds, FTAG);
        return (err);
 }
@@ -3002,9 +2977,11 @@ dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
        if (csa->cds->ds_prev != csa->ohds->ds_prev)
                return (EINVAL);
 
-       /* cds should be the clone */
-       if (csa->cds->ds_prev->ds_phys->ds_next_snap_obj !=
-           csa->ohds->ds_object)
+       /* cds should be the clone (unless they are unrelated) */
+       if (csa->cds->ds_prev != NULL &&
+           csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap &&
+           csa->ohds->ds_object !=
+           csa->cds->ds_prev->ds_phys->ds_next_snap_obj)
                return (EINVAL);
 
        /* the clone should be a child of the origin */
@@ -3027,38 +3004,49 @@ dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
            dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE))
                return (ENOSPC);
 
+       if (csa->ohds->ds_quota != 0 &&
+           csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota)
+               return (EDQUOT);
+
        return (0);
 }
 
 /* ARGSUSED */
 static void
-dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        struct cloneswaparg *csa = arg1;
        dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;
 
        ASSERT(csa->cds->ds_reserved == 0);
-       ASSERT(csa->cds->ds_quota == csa->ohds->ds_quota);
+       ASSERT(csa->ohds->ds_quota == 0 ||
+           csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota);
 
        dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
        dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);
-       dmu_buf_will_dirty(csa->cds->ds_prev->ds_dbuf, tx);
 
-       if (csa->cds->ds_user_ptr != NULL) {
-               csa->cds->ds_user_evict_func(csa->cds, csa->cds->ds_user_ptr);
-               csa->cds->ds_user_ptr = NULL;
+       if (csa->cds->ds_objset != NULL) {
+               dmu_objset_evict(csa->cds->ds_objset);
+               csa->cds->ds_objset = NULL;
        }
 
-       if (csa->ohds->ds_user_ptr != NULL) {
-               csa->ohds->ds_user_evict_func(csa->ohds,
-                   csa->ohds->ds_user_ptr);
-               csa->ohds->ds_user_ptr = NULL;
+       if (csa->ohds->ds_objset != NULL) {
+               dmu_objset_evict(csa->ohds->ds_objset);
+               csa->ohds->ds_objset = NULL;
        }
 
-       /* reset origin's unique bytes */
-       VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
-           csa->cds->ds_prev->ds_phys->ds_prev_snap_txg, UINT64_MAX,
-           &csa->cds->ds_prev->ds_phys->ds_unique_bytes));
+       /*
+        * Reset origin's unique bytes, if it exists.
+        */
+       if (csa->cds->ds_prev) {
+               dsl_dataset_t *origin = csa->cds->ds_prev;
+               uint64_t comp, uncomp;
+
+               dmu_buf_will_dirty(origin->ds_dbuf, tx);
+               dsl_deadlist_space_range(&csa->cds->ds_deadlist,
+                   origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+                   &origin->ds_phys->ds_unique_bytes, &comp, &uncomp);
+       }
 
        /* swap blkptrs */
        {
@@ -3077,10 +3065,10 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
                ASSERT3U(csa->cds->ds_dir->dd_phys->
                    dd_used_breakdown[DD_USED_SNAP], ==, 0);
 
-               VERIFY(0 == bplist_space(&csa->cds->ds_deadlist, &cdl_used,
-                   &cdl_comp, &cdl_uncomp));
-               VERIFY(0 == bplist_space(&csa->ohds->ds_deadlist, &odl_used,
-                   &odl_comp, &odl_uncomp));
+               dsl_deadlist_space(&csa->cds->ds_deadlist,
+                   &cdl_used, &cdl_comp, &cdl_uncomp);
+               dsl_deadlist_space(&csa->ohds->ds_deadlist,
+                   &odl_used, &odl_comp, &odl_uncomp);
 
                dused = csa->cds->ds_phys->ds_used_bytes + cdl_used -
                    (csa->ohds->ds_phys->ds_used_bytes + odl_used);
@@ -3101,21 +3089,16 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
                 * deadlist (since that's the only thing that's
                 * changing that affects the snapused).
                 */
-               VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
-                   csa->ohds->ds_origin_txg, UINT64_MAX, &cdl_used));
-               VERIFY(0 == bplist_space_birthrange(&csa->ohds->ds_deadlist,
-                   csa->ohds->ds_origin_txg, UINT64_MAX, &odl_used));
+               dsl_deadlist_space_range(&csa->cds->ds_deadlist,
+                   csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
+                   &cdl_used, &cdl_comp, &cdl_uncomp);
+               dsl_deadlist_space_range(&csa->ohds->ds_deadlist,
+                   csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
+                   &odl_used, &odl_comp, &odl_uncomp);
                dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
                    DD_USED_HEAD, DD_USED_SNAP, tx);
        }
 
-#define        SWITCH64(x, y) \
-       { \
-               uint64_t __tmp = (x); \
-               (x) = (y); \
-               (y) = __tmp; \
-       }
-
        /* swap ds_*_bytes */
        SWITCH64(csa->ohds->ds_phys->ds_used_bytes,
            csa->cds->ds_phys->ds_used_bytes);
@@ -3130,22 +3113,26 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
        dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV,
            csa->unused_refres_delta, 0, 0, tx);
 
-       /* swap deadlists */
-       bplist_close(&csa->cds->ds_deadlist);
-       bplist_close(&csa->ohds->ds_deadlist);
+       /*
+        * Swap deadlists.
+        */
+       dsl_deadlist_close(&csa->cds->ds_deadlist);
+       dsl_deadlist_close(&csa->ohds->ds_deadlist);
        SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
            csa->cds->ds_phys->ds_deadlist_obj);
-       VERIFY(0 == bplist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
-           csa->cds->ds_phys->ds_deadlist_obj));
-       VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
-           csa->ohds->ds_phys->ds_deadlist_obj));
+       dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
+           csa->cds->ds_phys->ds_deadlist_obj);
+       dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
+           csa->ohds->ds_phys->ds_deadlist_obj);
 
-       dsl_pool_ds_clone_swapped(csa->ohds, csa->cds, tx);
+       dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx);
 }
 
 /*
- * Swap 'clone' with its origin head file system.  Used at the end
- * of "online recv" to swizzle the file system to the new version.
+ * Swap 'clone' with its origin head datasets.  Used at the end of "zfs
+ * recv" into an existing fs to swizzle the file system to the new
+ * version, and by "zfs rollback".  Can also be used to swap two
+ * independent head datasets if neither has any snapshots.
  */
 int
 dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
@@ -3254,62 +3241,70 @@ static int
 dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dataset_t *ds = arg1;
-       uint64_t *quotap = arg2;
-       uint64_t new_quota = *quotap;
+       dsl_prop_setarg_t *psa = arg2;
+       int err;
 
        if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
                return (ENOTSUP);
 
-       if (new_quota == 0)
+       if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
+               return (err);
+
+       if (psa->psa_effective_value == 0)
                return (0);
 
-       if (new_quota < ds->ds_phys->ds_used_bytes ||
-           new_quota < ds->ds_reserved)
+       if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes ||
+           psa->psa_effective_value < ds->ds_reserved)
                return (ENOSPC);
 
        return (0);
 }
 
-/* ARGSUSED */
+extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *);
+
 void
-dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dataset_t *ds = arg1;
-       uint64_t *quotap = arg2;
-       uint64_t new_quota = *quotap;
+       dsl_prop_setarg_t *psa = arg2;
+       uint64_t effective_value = psa->psa_effective_value;
 
-       dmu_buf_will_dirty(ds->ds_dbuf, tx);
-
-       ds->ds_quota = new_quota;
+       dsl_prop_set_sync(ds, psa, tx);
+       DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
 
-       dsl_dir_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx);
+       if (ds->ds_quota != effective_value) {
+               dmu_buf_will_dirty(ds->ds_dbuf, tx);
+               ds->ds_quota = effective_value;
 
-       spa_history_internal_log(LOG_DS_REFQUOTA, ds->ds_dir->dd_pool->dp_spa,
-           tx, cr, "%lld dataset = %llu ",
-           (longlong_t)new_quota, ds->ds_object);
+               spa_history_log_internal(LOG_DS_REFQUOTA,
+                   ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu ",
+                   (longlong_t)ds->ds_quota, ds->ds_object);
+       }
 }
 
 int
-dsl_dataset_set_quota(const char *dsname, uint64_t quota)
+dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota)
 {
        dsl_dataset_t *ds;
+       dsl_prop_setarg_t psa;
        int err;
 
+       dsl_prop_setarg_init_uint64(&psa, "refquota", source, &quota);
+
        err = dsl_dataset_hold(dsname, FTAG, &ds);
        if (err)
                return (err);
 
-       if (quota != ds->ds_quota) {
-               /*
-                * If someone removes a file, then tries to set the quota, we
-                * want to make sure the file freeing takes effect.
-                */
-               txg_wait_open(ds->ds_dir->dd_pool, 0);
+       /*
+        * If someone removes a file, then tries to set the quota, we
+        * want to make sure the file freeing takes effect.
+        */
+       txg_wait_open(ds->ds_dir->dd_pool, 0);
+
+       err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+           dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
+           ds, &psa, 0);
 
-               err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-                   dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
-                   ds, &quota, 0);
-       }
        dsl_dataset_rele(ds, FTAG);
        return (err);
 }
@@ -3318,9 +3313,10 @@ static int
 dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dataset_t *ds = arg1;
-       uint64_t *reservationp = arg2;
-       uint64_t new_reservation = *reservationp;
+       dsl_prop_setarg_t *psa = arg2;
+       uint64_t effective_value;
        uint64_t unique;
+       int err;
 
        if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
            SPA_VERSION_REFRESERVATION)
@@ -3329,6 +3325,11 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
        if (dsl_dataset_is_snapshot(ds))
                return (EINVAL);
 
+       if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
+               return (err);
+
+       effective_value = psa->psa_effective_value;
+
        /*
         * If we are doing the preliminary check in open context, the
         * space estimates may be inaccurate.
@@ -3337,76 +3338,101 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
                return (0);
 
        mutex_enter(&ds->ds_lock);
-       unique = dsl_dataset_unique(ds);
+       if (!DS_UNIQUE_IS_ACCURATE(ds))
+               dsl_dataset_recalc_head_uniq(ds);
+       unique = ds->ds_phys->ds_unique_bytes;
        mutex_exit(&ds->ds_lock);
 
-       if (MAX(unique, new_reservation) > MAX(unique, ds->ds_reserved)) {
-               uint64_t delta = MAX(unique, new_reservation) -
+       if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) {
+               uint64_t delta = MAX(unique, effective_value) -
                    MAX(unique, ds->ds_reserved);
 
                if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
                        return (ENOSPC);
                if (ds->ds_quota > 0 &&
-                   new_reservation > ds->ds_quota)
+                   effective_value > ds->ds_quota)
                        return (ENOSPC);
        }
 
        return (0);
 }
 
-/* ARGSUSED */
 static void
-dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr,
-    dmu_tx_t *tx)
+dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dataset_t *ds = arg1;
-       uint64_t *reservationp = arg2;
-       uint64_t new_reservation = *reservationp;
+       dsl_prop_setarg_t *psa = arg2;
+       uint64_t effective_value = psa->psa_effective_value;
        uint64_t unique;
        int64_t delta;
 
+       dsl_prop_set_sync(ds, psa, tx);
+       DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
+
        dmu_buf_will_dirty(ds->ds_dbuf, tx);
 
        mutex_enter(&ds->ds_dir->dd_lock);
        mutex_enter(&ds->ds_lock);
-       unique = dsl_dataset_unique(ds);
-       delta = MAX(0, (int64_t)(new_reservation - unique)) -
+       ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
+       unique = ds->ds_phys->ds_unique_bytes;
+       delta = MAX(0, (int64_t)(effective_value - unique)) -
            MAX(0, (int64_t)(ds->ds_reserved - unique));
-       ds->ds_reserved = new_reservation;
+       ds->ds_reserved = effective_value;
        mutex_exit(&ds->ds_lock);
 
        dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
        mutex_exit(&ds->ds_dir->dd_lock);
-       dsl_dir_prop_set_uint64_sync(ds->ds_dir, "refreservation",
-           new_reservation, cr, tx);
 
-       spa_history_internal_log(LOG_DS_REFRESERV,
-           ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu",
-           (longlong_t)new_reservation, ds->ds_object);
+       spa_history_log_internal(LOG_DS_REFRESERV,
+           ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu",
+           (longlong_t)effective_value, ds->ds_object);
 }
 
 int
-dsl_dataset_set_reservation(const char *dsname, uint64_t reservation)
+dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
+    uint64_t reservation)
 {
        dsl_dataset_t *ds;
+       dsl_prop_setarg_t psa;
        int err;
 
+       dsl_prop_setarg_init_uint64(&psa, "refreservation", source,
+           &reservation);
+
        err = dsl_dataset_hold(dsname, FTAG, &ds);
        if (err)
                return (err);
 
        err = dsl_sync_task_do(ds->ds_dir->dd_pool,
            dsl_dataset_set_reservation_check,
-           dsl_dataset_set_reservation_sync, ds, &reservation, 0);
+           dsl_dataset_set_reservation_sync, ds, &psa, 0);
+
        dsl_dataset_rele(ds, FTAG);
        return (err);
 }
 
+struct dsl_ds_holdarg {
+       dsl_sync_task_group_t *dstg;
+       char *htag;
+       char *snapname;
+       boolean_t recursive;
+       boolean_t gotone;
+       boolean_t temphold;
+       char failed[MAXPATHLEN];
+};
+
+/*
+ * The max length of a temporary tag prefix is the number of hex digits
+ * required to express UINT64_MAX plus one for the hyphen.
+ */
+#define        MAX_TAG_PREFIX_LEN      17
+
 static int
 dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dataset_t *ds = arg1;
-       char *htag = arg2;
+       struct dsl_ds_holdarg *ha = arg2;
+       char *htag = ha->htag;
        objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
        int error = 0;
 
@@ -3416,9 +3442,6 @@ dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
        if (!dsl_dataset_is_snapshot(ds))
                return (EINVAL);
 
-       if (strlen(htag) >= ZAP_MAXNAMELEN)
-               return (ENAMETOOLONG);
-
        /* tags must be unique */
        mutex_enter(&ds->ds_lock);
        if (ds->ds_phys->ds_userrefs_obj) {
@@ -3431,16 +3454,22 @@ dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
        }
        mutex_exit(&ds->ds_lock);
 
+       if (error == 0 && ha->temphold &&
+           strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
+               error = E2BIG;
+
        return (error);
 }
 
 static void
-dsl_dataset_user_hold_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dataset_t *ds = arg1;
-       char *htag = arg2;
-       objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-       time_t now = gethrestime_sec();
+       struct dsl_ds_holdarg *ha = arg2;
+       char *htag = ha->htag;
+       dsl_pool_t *dp = ds->ds_dir->dd_pool;
+       objset_t *mos = dp->dp_meta_objset;
+       uint64_t now = gethrestime_sec();
        uint64_t zapobj;
 
        mutex_enter(&ds->ds_lock);
@@ -3460,48 +3489,43 @@ dsl_dataset_user_hold_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 
        VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx));
 
-       spa_history_internal_log(LOG_DS_USER_HOLD,
-           ds->ds_dir->dd_pool->dp_spa, tx, cr, "<%s> dataset = %llu",
-           htag, ds->ds_object);
-}
+       if (ha->temphold) {
+               VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object,
+                   htag, &now, tx));
+       }
 
-struct dsl_ds_holdarg {
-       dsl_sync_task_group_t *dstg;
-       char *htag;
-       char *snapname;
-       boolean_t recursive;
-       char failed[MAXPATHLEN];
-};
+       spa_history_log_internal(LOG_DS_USER_HOLD,
+           dp->dp_spa, tx, "<%s> temp = %d dataset = %llu", htag,
+           (int)ha->temphold, ds->ds_object);
+}
 
 static int
-dsl_dataset_user_hold_one(char *dsname, void *arg)
+dsl_dataset_user_hold_one(const char *dsname, void *arg)
 {
        struct dsl_ds_holdarg *ha = arg;
        dsl_dataset_t *ds;
        int error;
        char *name;
-       size_t buflen;
 
        /* alloc a buffer to hold dsname@snapname plus terminating NULL */
-       buflen = strlen(dsname) + strlen(ha->snapname) + 2;
-       name = kmem_alloc(buflen, KM_SLEEP);
-       (void) snprintf(name, buflen, "%s@%s", dsname, ha->snapname);
+       name = kmem_asprintf("%s@%s", dsname, ha->snapname);
        error = dsl_dataset_hold(name, ha->dstg, &ds);
-       kmem_free(name, buflen);
+       strfree(name);
        if (error == 0) {
+               ha->gotone = B_TRUE;
                dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check,
-                   dsl_dataset_user_hold_sync, ds, ha->htag, 0);
+                   dsl_dataset_user_hold_sync, ds, ha, 0);
        } else if (error == ENOENT && ha->recursive) {
                error = 0;
        } else {
-               (void) strcpy(ha->failed, dsname);
+               (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
        }
        return (error);
 }
 
 int
 dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
-    boolean_t recursive)
+    boolean_t recursive, boolean_t temphold)
 {
        struct dsl_ds_holdarg *ha;
        dsl_sync_task_t *dst;
@@ -3522,6 +3546,7 @@ dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
        ha->htag = htag;
        ha->snapname = snapname;
        ha->recursive = recursive;
+       ha->temphold = temphold;
        if (recursive) {
                error = dmu_objset_find(dsname, dsl_dataset_user_hold_one,
                    ha, DS_FIND_CHILDREN);
@@ -3542,8 +3567,11 @@ dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
                dsl_dataset_rele(ds, ha->dstg);
        }
 
+       if (error == 0 && recursive && !ha->gotone)
+               error = ENOENT;
+
        if (error)
-               (void) strcpy(dsname, ha->failed);
+               (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
 
        dsl_sync_task_group_destroy(ha->dstg);
        kmem_free(ha, sizeof (struct dsl_ds_holdarg));
@@ -3618,10 +3646,6 @@ dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
                         */
                        if (!ra->own)
                                return (EBUSY);
-                       if (ds->ds_user_ptr) {
-                               ds->ds_user_evict_func(ds, ds->ds_user_ptr);
-                               ds->ds_user_ptr = NULL;
-                       }
                }
                dsda.ds = ds;
                dsda.releasing = B_TRUE;
@@ -3632,20 +3656,28 @@ dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
 }
 
 static void
-dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx)
 {
        struct dsl_ds_releasearg *ra = arg1;
        dsl_dataset_t *ds = ra->ds;
-       spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
-       objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+       dsl_pool_t *dp = ds->ds_dir->dd_pool;
+       objset_t *mos = dp->dp_meta_objset;
        uint64_t zapobj;
        uint64_t dsobj = ds->ds_object;
        uint64_t refs;
+       int error;
+
+       if (ds->ds_objset) {
+               dmu_objset_evict(ds->ds_objset);
+               ds->ds_objset = NULL;
+       }
 
        mutex_enter(&ds->ds_lock);
        ds->ds_userrefs--;
        refs = ds->ds_userrefs;
        mutex_exit(&ds->ds_lock);
+       error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx);
+       VERIFY(error == 0 || error == ENOENT);
        zapobj = ds->ds_phys->ds_userrefs_obj;
        VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx));
        if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 &&
@@ -3656,16 +3688,16 @@ dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
                dsda.ds = ds;
                dsda.releasing = B_TRUE;
                /* We already did the destroy_check */
-               dsl_dataset_destroy_sync(&dsda, tag, cr, tx);
+               dsl_dataset_destroy_sync(&dsda, tag, tx);
        }
 
-       spa_history_internal_log(LOG_DS_USER_RELEASE,
-           spa, tx, cr, "<%s> %lld dataset = %llu",
+       spa_history_log_internal(LOG_DS_USER_RELEASE,
+           dp->dp_spa, tx, "<%s> %lld dataset = %llu",
            ra->htag, (longlong_t)refs, dsobj);
 }
 
 static int
-dsl_dataset_user_release_one(char *dsname, void *arg)
+dsl_dataset_user_release_one(const char *dsname, void *arg)
 {
        struct dsl_ds_holdarg *ha = arg;
        struct dsl_ds_releasearg *ra;
@@ -3673,25 +3705,21 @@ dsl_dataset_user_release_one(char *dsname, void *arg)
        int error;
        void *dtag = ha->dstg;
        char *name;
-       size_t buflen;
        boolean_t own = B_FALSE;
        boolean_t might_destroy;
 
-       if (strlen(ha->htag) >= ZAP_MAXNAMELEN)
-               return (ENAMETOOLONG);
-
        /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */
-       buflen = strlen(dsname) + strlen(ha->snapname) + 2;
-       name = kmem_alloc(buflen, KM_SLEEP);
-       (void) snprintf(name, buflen, "%s@%s", dsname, ha->snapname);
+       name = kmem_asprintf("%s@%s", dsname, ha->snapname);
        error = dsl_dataset_hold(name, dtag, &ds);
-       kmem_free(name, buflen);
+       strfree(name);
        if (error == ENOENT && ha->recursive)
                return (0);
-       (void) strcpy(ha->failed, dsname);
+       (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
        if (error)
                return (error);
 
+       ha->gotone = B_TRUE;
+
        ASSERT(dsl_dataset_is_snapshot(ds));
 
        error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy);
@@ -3702,19 +3730,15 @@ dsl_dataset_user_release_one(char *dsname, void *arg)
 
        if (might_destroy) {
 #ifdef _KERNEL
+               name = kmem_asprintf("%s@%s", dsname, ha->snapname);
                error = zfs_unmount_snap(name, NULL);
-               if (error) {
-                       dsl_dataset_rele(ds, dtag);
-                       return (error);
-               }
-               error = dsl_dataset_zvol_cleanup(ds, name);
+               strfree(name);
                if (error) {
                        dsl_dataset_rele(ds, dtag);
                        return (error);
                }
 #endif
-               if (!dsl_dataset_tryown(ds,
-                   DS_MODE_READONLY | DS_MODE_INCONSISTENT, dtag)) {
+               if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) {
                        dsl_dataset_rele(ds, dtag);
                        return (EBUSY);
                } else {
@@ -3742,6 +3766,7 @@ dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
        spa_t *spa;
        int error;
 
+top:
        ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
 
        (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
@@ -3781,15 +3806,58 @@ dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
                kmem_free(ra, sizeof (struct dsl_ds_releasearg));
        }
 
-       if (error)
-               (void) strcpy(dsname, ha->failed);
+       if (error == 0 && recursive && !ha->gotone)
+               error = ENOENT;
+
+       if (error && error != EBUSY)
+               (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
 
        dsl_sync_task_group_destroy(ha->dstg);
        kmem_free(ha, sizeof (struct dsl_ds_holdarg));
        spa_close(spa, FTAG);
+
+       /*
+        * We can get EBUSY if we were racing with deferred destroy and
+        * dsl_dataset_user_release_check() hadn't done the necessary
+        * open context setup.  We can also get EBUSY if we're racing
+        * with destroy and that thread is the ds_owner.  Either way
+        * the busy condition should be transient, and we should retry
+        * the release operation.
+        */
+       if (error == EBUSY)
+               goto top;
+
        return (error);
 }
 
+/*
+ * Called at spa_load time to release a stale temporary user hold.
+ */
+int
+dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag)
+{
+       dsl_dataset_t *ds;
+       char *snap;
+       char *name;
+       int namelen;
+       int error;
+
+       rw_enter(&dp->dp_config_rwlock, RW_READER);
+       error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+       rw_exit(&dp->dp_config_rwlock);
+       if (error)
+               return (error);
+       namelen = dsl_dataset_namelen(ds)+1;
+       name = kmem_alloc(namelen, KM_SLEEP);
+       dsl_dataset_name(ds, name);
+       dsl_dataset_rele(ds, FTAG);
+
+       snap = strchr(name, '@');
+       *snap = '\0';
+       ++snap;
+       return (dsl_dataset_user_release(name, snap, htag, B_FALSE));
+}
+
 int
 dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp)
 {
@@ -3819,3 +3887,24 @@ dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp)
        dsl_dataset_rele(ds, FTAG);
        return (0);
 }
+
+/*
+ * Note, this function is used as the callback for dmu_objset_find().  We
+ * always return 0 so that we will continue to find and process
+ * inconsistent datasets, even if we encounter an error trying to
+ * process one of them.
+ */
+/* ARGSUSED */
+int
+dsl_destroy_inconsistent(const char *dsname, void *arg)
+{
+       dsl_dataset_t *ds;
+
+       if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) {
+               if (DS_IS_INCONSISTENT(ds))
+                       (void) dsl_dataset_destroy(ds, FTAG, B_FALSE);
+               else
+                       dsl_dataset_disown(ds, FTAG);
+       }
+       return (0);
+}
diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c
new file mode 100644 (file)
index 0000000..064f8ac
--- /dev/null
@@ -0,0 +1,474 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/dsl_dataset.h>
+#include <sys/dmu.h>
+#include <sys/refcount.h>
+#include <sys/zap.h>
+#include <sys/zfs_context.h>
+#include <sys/dsl_pool.h>
+
+static int
+dsl_deadlist_compare(const void *arg1, const void *arg2)
+{
+       const dsl_deadlist_entry_t *dle1 = arg1;
+       const dsl_deadlist_entry_t *dle2 = arg2;
+
+       if (dle1->dle_mintxg < dle2->dle_mintxg)
+               return (-1);
+       else if (dle1->dle_mintxg > dle2->dle_mintxg)
+               return (+1);
+       else
+               return (0);
+}
+
+static void
+dsl_deadlist_load_tree(dsl_deadlist_t *dl)
+{
+       zap_cursor_t zc;
+       zap_attribute_t za;
+
+       ASSERT(!dl->dl_oldfmt);
+       if (dl->dl_havetree)
+               return;
+
+       avl_create(&dl->dl_tree, dsl_deadlist_compare,
+           sizeof (dsl_deadlist_entry_t),
+           offsetof(dsl_deadlist_entry_t, dle_node));
+       for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
+           zap_cursor_retrieve(&zc, &za) == 0;
+           zap_cursor_advance(&zc)) {
+               dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
+               dle->dle_mintxg = strtonum(za.za_name, NULL);
+               VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os,
+                   za.za_first_integer));
+               avl_add(&dl->dl_tree, dle);
+       }
+       zap_cursor_fini(&zc);
+       dl->dl_havetree = B_TRUE;
+}
+
+void
+dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
+{
+       dmu_object_info_t doi;
+
+       mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);
+       dl->dl_os = os;
+       dl->dl_object = object;
+       VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf));
+       dmu_object_info_from_db(dl->dl_dbuf, &doi);
+       if (doi.doi_type == DMU_OT_BPOBJ) {
+               dmu_buf_rele(dl->dl_dbuf, dl);
+               dl->dl_dbuf = NULL;
+               dl->dl_oldfmt = B_TRUE;
+               VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object));
+               return;
+       }
+
+       dl->dl_oldfmt = B_FALSE;
+       dl->dl_phys = dl->dl_dbuf->db_data;
+       dl->dl_havetree = B_FALSE;
+}
+
+void
+dsl_deadlist_close(dsl_deadlist_t *dl)
+{
+       void *cookie = NULL;
+       dsl_deadlist_entry_t *dle;
+
+       if (dl->dl_oldfmt) {
+               dl->dl_oldfmt = B_FALSE;
+               bpobj_close(&dl->dl_bpobj);
+               return;
+       }
+
+       if (dl->dl_havetree) {
+               while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
+                   != NULL) {
+                       bpobj_close(&dle->dle_bpobj);
+                       kmem_free(dle, sizeof (*dle));
+               }
+               avl_destroy(&dl->dl_tree);
+       }
+       dmu_buf_rele(dl->dl_dbuf, dl);
+       mutex_destroy(&dl->dl_lock);
+       dl->dl_dbuf = NULL;
+       dl->dl_phys = NULL;
+}
+
+uint64_t
+dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
+{
+       if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
+               return (bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx));
+       return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
+           sizeof (dsl_deadlist_phys_t), tx));
+}
+
+void
+dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
+{
+       dmu_object_info_t doi;
+       zap_cursor_t zc;
+       zap_attribute_t za;
+
+       VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi));
+       if (doi.doi_type == DMU_OT_BPOBJ) {
+               bpobj_free(os, dlobj, tx);
+               return;
+       }
+
+       for (zap_cursor_init(&zc, os, dlobj);
+           zap_cursor_retrieve(&zc, &za) == 0;
+           zap_cursor_advance(&zc))
+               bpobj_free(os, za.za_first_integer, tx);
+       zap_cursor_fini(&zc);
+       VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx));
+}
+
+void
+dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
+{
+       dsl_deadlist_entry_t dle_tofind;
+       dsl_deadlist_entry_t *dle;
+       avl_index_t where;
+
+       if (dl->dl_oldfmt) {
+               bpobj_enqueue(&dl->dl_bpobj, bp, tx);
+               return;
+       }
+
+       dsl_deadlist_load_tree(dl);
+
+       dmu_buf_will_dirty(dl->dl_dbuf, tx);
+       mutex_enter(&dl->dl_lock);
+       dl->dl_phys->dl_used +=
+           bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
+       dl->dl_phys->dl_comp += BP_GET_PSIZE(bp);
+       dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp);
+       mutex_exit(&dl->dl_lock);
+
+       dle_tofind.dle_mintxg = bp->blk_birth;
+       dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+       if (dle == NULL)
+               dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
+       else
+               dle = AVL_PREV(&dl->dl_tree, dle);
+       bpobj_enqueue(&dle->dle_bpobj, bp, tx);
+}
+
+/*
+ * Insert new key in deadlist, which must be > all current entries.
+ * mintxg is not inclusive.
+ */
+void
+dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
+{
+       uint64_t obj;
+       dsl_deadlist_entry_t *dle;
+
+       if (dl->dl_oldfmt)
+               return;
+
+       dsl_deadlist_load_tree(dl);
+
+       dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
+       dle->dle_mintxg = mintxg;
+       obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+       VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+       avl_add(&dl->dl_tree, dle);
+
+       VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object,
+           mintxg, obj, tx));
+}
+
+/*
+ * Remove this key, merging its entries into the previous key.
+ */
+void
+dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
+{
+       dsl_deadlist_entry_t dle_tofind;
+       dsl_deadlist_entry_t *dle, *dle_prev;
+
+       if (dl->dl_oldfmt)
+               return;
+
+       dsl_deadlist_load_tree(dl);
+
+       dle_tofind.dle_mintxg = mintxg;
+       dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
+       dle_prev = AVL_PREV(&dl->dl_tree, dle);
+
+       bpobj_enqueue_subobj(&dle_prev->dle_bpobj,
+           dle->dle_bpobj.bpo_object, tx);
+
+       avl_remove(&dl->dl_tree, dle);
+       bpobj_close(&dle->dle_bpobj);
+       kmem_free(dle, sizeof (*dle));
+
+       VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
+}
+
+/*
+ * Walk ds's snapshots to regenerate the ZAP & AVL.
+ */
+static void
+dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
+    uint64_t mrs_obj, dmu_tx_t *tx)
+{
+       dsl_deadlist_t dl;
+       dsl_pool_t *dp = dmu_objset_pool(os);
+
+       dsl_deadlist_open(&dl, os, dlobj);
+       if (dl.dl_oldfmt) {
+               dsl_deadlist_close(&dl);
+               return;
+       }
+
+       while (mrs_obj != 0) {
+               dsl_dataset_t *ds;
+               VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
+               dsl_deadlist_add_key(&dl, ds->ds_phys->ds_prev_snap_txg, tx);
+               mrs_obj = ds->ds_phys->ds_prev_snap_obj;
+               dsl_dataset_rele(ds, FTAG);
+       }
+       dsl_deadlist_close(&dl);
+}
+
+uint64_t
+dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
+    uint64_t mrs_obj, dmu_tx_t *tx)
+{
+       dsl_deadlist_entry_t *dle;
+       uint64_t newobj;
+
+       newobj = dsl_deadlist_alloc(dl->dl_os, tx);
+
+       if (dl->dl_oldfmt) {
+               dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
+               return (newobj);
+       }
+
+       dsl_deadlist_load_tree(dl);
+
+       for (dle = avl_first(&dl->dl_tree); dle;
+           dle = AVL_NEXT(&dl->dl_tree, dle)) {
+               uint64_t obj;
+
+               if (dle->dle_mintxg >= maxtxg)
+                       break;
+
+               obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+               VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
+                   dle->dle_mintxg, obj, tx));
+       }
+       return (newobj);
+}
+
+void
+dsl_deadlist_space(dsl_deadlist_t *dl,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+       if (dl->dl_oldfmt) {
+               VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj,
+                   usedp, compp, uncompp));
+               return;
+       }
+
+       mutex_enter(&dl->dl_lock);
+       *usedp = dl->dl_phys->dl_used;
+       *compp = dl->dl_phys->dl_comp;
+       *uncompp = dl->dl_phys->dl_uncomp;
+       mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * return space used in the range (mintxg, maxtxg].
+ * Includes maxtxg, does not include mintxg.
+ * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
+ * UINT64_MAX).
+ */
+void
+dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+       dsl_deadlist_entry_t dle_tofind;
+       dsl_deadlist_entry_t *dle;
+       avl_index_t where;
+
+       if (dl->dl_oldfmt) {
+               VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj,
+                   mintxg, maxtxg, usedp, compp, uncompp));
+               return;
+       }
+
+       dsl_deadlist_load_tree(dl);
+       *usedp = *compp = *uncompp = 0;
+
+       dle_tofind.dle_mintxg = mintxg;
+       dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+       /*
+        * If we don't find this mintxg, there shouldn't be anything
+        * after it either.
+        */
+       ASSERT(dle != NULL ||
+           avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL);
+       for (; dle && dle->dle_mintxg < maxtxg;
+           dle = AVL_NEXT(&dl->dl_tree, dle)) {
+               uint64_t used, comp, uncomp;
+
+               VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
+                   &used, &comp, &uncomp));
+
+               *usedp += used;
+               *compp += comp;
+               *uncompp += uncomp;
+       }
+}
+
+static void
+dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
+    dmu_tx_t *tx)
+{
+       dsl_deadlist_entry_t dle_tofind;
+       dsl_deadlist_entry_t *dle;
+       avl_index_t where;
+       uint64_t used, comp, uncomp;
+       bpobj_t bpo;
+
+       VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
+       VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp));
+       bpobj_close(&bpo);
+
+       dsl_deadlist_load_tree(dl);
+
+       dmu_buf_will_dirty(dl->dl_dbuf, tx);
+       mutex_enter(&dl->dl_lock);
+       dl->dl_phys->dl_used += used;
+       dl->dl_phys->dl_comp += comp;
+       dl->dl_phys->dl_uncomp += uncomp;
+       mutex_exit(&dl->dl_lock);
+
+       dle_tofind.dle_mintxg = birth;
+       dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+       if (dle == NULL)
+               dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
+       bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
+}
+
+static int
+dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+       dsl_deadlist_t *dl = arg;
+       dsl_deadlist_insert(dl, bp, tx);
+       return (0);
+}
+
+/*
+ * Merge the deadlist pointed to by 'obj' into dl.  obj will be left as
+ * an empty deadlist.
+ */
+void
+dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
+{
+       zap_cursor_t zc;
+       zap_attribute_t za;
+       dmu_buf_t *bonus;
+       dsl_deadlist_phys_t *dlp;
+       dmu_object_info_t doi;
+
+       VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi));
+       if (doi.doi_type == DMU_OT_BPOBJ) {
+               bpobj_t bpo;
+               VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
+               VERIFY3U(0, ==, bpobj_iterate(&bpo,
+                   dsl_deadlist_insert_cb, dl, tx));
+               bpobj_close(&bpo);
+               return;
+       }
+
+       for (zap_cursor_init(&zc, dl->dl_os, obj);
+           zap_cursor_retrieve(&zc, &za) == 0;
+           zap_cursor_advance(&zc)) {
+               uint64_t mintxg = strtonum(za.za_name, NULL);
+               dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
+               VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx));
+       }
+       zap_cursor_fini(&zc);
+
+       VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
+       dlp = bonus->db_data;
+       dmu_buf_will_dirty(bonus, tx);
+       bzero(dlp, sizeof (*dlp));
+       dmu_buf_rele(bonus, FTAG);
+}
+
+/*
+ * Remove entries on dl that are >= mintxg, and put them on the bpobj.
+ */
+void
+dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
+    dmu_tx_t *tx)
+{
+       dsl_deadlist_entry_t dle_tofind;
+       dsl_deadlist_entry_t *dle;
+       avl_index_t where;
+
+       ASSERT(!dl->dl_oldfmt);
+       dmu_buf_will_dirty(dl->dl_dbuf, tx);
+       dsl_deadlist_load_tree(dl);
+
+       dle_tofind.dle_mintxg = mintxg;
+       dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+       if (dle == NULL)
+               dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
+       while (dle) {
+               uint64_t used, comp, uncomp;
+               dsl_deadlist_entry_t *dle_next;
+
+               bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
+
+               VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
+                   &used, &comp, &uncomp));
+               mutex_enter(&dl->dl_lock);
+               ASSERT3U(dl->dl_phys->dl_used, >=, used);
+               ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
+               ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
+               dl->dl_phys->dl_used -= used;
+               dl->dl_phys->dl_comp -= comp;
+               dl->dl_phys->dl_uncomp -= uncomp;
+               mutex_exit(&dl->dl_lock);
+
+               VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object,
+                   dle->dle_mintxg, tx));
+
+               dle_next = AVL_NEXT(&dl->dl_tree, dle);
+               avl_remove(&dl->dl_tree, dle);
+               bpobj_close(&dle->dle_bpobj);
+               kmem_free(dle, sizeof (*dle));
+               dle = dle_next;
+       }
+}
index 5d76ff5..85490c8 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
@@ -75,8 +74,6 @@
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_deleg.h>
 #include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/zio_checksum.h> /* for the default checksum value */
 #include <sys/zap.h>
 #include <sys/fs/zfs.h>
 #include <sys/cred.h>
@@ -150,7 +147,7 @@ dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr)
 }
 
 static void
-dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dir_t *dd = arg1;
        nvlist_t *nvp = arg2;
@@ -185,8 +182,8 @@ dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 
                        VERIFY(zap_update(mos, jumpobj,
                            perm, 8, 1, &n, tx) == 0);
-                       spa_history_internal_log(LOG_DS_PERM_UPDATE,
-                           dd->dd_pool->dp_spa, tx, cr,
+                       spa_history_log_internal(LOG_DS_PERM_UPDATE,
+                           dd->dd_pool->dp_spa, tx,
                            "%s %s dataset = %llu", whokey, perm,
                            dd->dd_phys->dd_head_dataset_obj);
                }
@@ -194,7 +191,7 @@ dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 }
 
 static void
-dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_deleg_unset_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dir_t *dd = arg1;
        nvlist_t *nvp = arg2;
@@ -217,8 +214,8 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
                                (void) zap_remove(mos, zapobj, whokey, tx);
                                VERIFY(0 == zap_destroy(mos, jumpobj, tx));
                        }
-                       spa_history_internal_log(LOG_DS_PERM_WHO_REMOVE,
-                           dd->dd_pool->dp_spa, tx, cr,
+                       spa_history_log_internal(LOG_DS_PERM_WHO_REMOVE,
+                           dd->dd_pool->dp_spa, tx,
                            "%s dataset = %llu", whokey,
                            dd->dd_phys->dd_head_dataset_obj);
                        continue;
@@ -238,8 +235,8 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
                                VERIFY(0 == zap_destroy(mos,
                                    jumpobj, tx));
                        }
-                       spa_history_internal_log(LOG_DS_PERM_REMOVE,
-                           dd->dd_pool->dp_spa, tx, cr,
+                       spa_history_log_internal(LOG_DS_PERM_REMOVE,
+                           dd->dd_pool->dp_spa, tx,
                            "%s %s dataset = %llu", whokey, perm,
                            dd->dd_phys->dd_head_dataset_obj);
                }
@@ -589,7 +586,7 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
 
                        if (dsl_prop_get_dd(dd,
                            zfs_prop_to_name(ZFS_PROP_ZONED),
-                           8, 1, &zoned, NULL) != 0)
+                           8, 1, &zoned, NULL, B_FALSE) != 0)
                                break;
                        if (!zoned)
                                break;
@@ -739,5 +736,5 @@ dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx)
 boolean_t
 dsl_delegation_on(objset_t *os)
 {
-       return (os->os->os_spa->spa_delegation);
+       return (!!spa_delegation(os->os_spa));
 }
index 2f312ae..1cd49c8 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -32,6 +31,7 @@
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_deleg.h>
 #include <sys/spa.h>
+#include <sys/metaslab.h>
 #include <sys/zap.h>
 #include <sys/zio.h>
 #include <sys/arc.h>
@@ -39,8 +39,7 @@
 #include "zfs_namecheck.h"
 
 static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
-static void dsl_dir_set_reservation_sync(void *arg1, void *arg2,
-    cred_t *cr, dmu_tx_t *tx);
+static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx);
 
 
 /* ARGSUSED */
@@ -63,8 +62,8 @@ dsl_dir_evict(dmu_buf_t *db, void *arg)
        spa_close(dd->dd_pool->dp_spa, dd);
 
        /*
-        * The props callback list should be empty since they hold the
-        * dir open.
+        * The props callback list should have been cleaned up by
+        * objset_evict().
         */
        list_destroy(&dd->dd_prop_cbs);
        mutex_destroy(&dd->dd_lock);
@@ -107,6 +106,8 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
                list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
                    offsetof(dsl_prop_cb_record_t, cbr_node));
 
+               dsl_dir_snap_cmtime_update(dd);
+
                if (dd->dd_phys->dd_parent_obj) {
                        err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj,
                            NULL, dd, &dd->dd_parent);
@@ -133,6 +134,25 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
                        (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
                }
 
+               if (dsl_dir_is_clone(dd)) {
+                       dmu_buf_t *origin_bonus;
+                       dsl_dataset_phys_t *origin_phys;
+
+                       /*
+                        * We can't open the origin dataset, because
+                        * that would require opening this dsl_dir.
+                        * Just look at its phys directly instead.
+                        */
+                       err = dmu_bonus_hold(dp->dp_meta_objset,
+                           dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus);
+                       if (err)
+                               goto errout;
+                       origin_phys = origin_bonus->db_data;
+                       dd->dd_origin_txg =
+                           origin_phys->ds_creation_txg;
+                       dmu_buf_rele(origin_bonus, FTAG);
+               }
+
                winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
                    dsl_dir_evict);
                if (winner) {
@@ -392,7 +412,7 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
 {
        objset_t *mos = dp->dp_meta_objset;
        uint64_t ddobj;
-       dsl_dir_phys_t *dsphys;
+       dsl_dir_phys_t *ddphys;
        dmu_buf_t *dbuf;
 
        ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
@@ -407,17 +427,17 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
        }
        VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
        dmu_buf_will_dirty(dbuf, tx);
-       dsphys = dbuf->db_data;
+       ddphys = dbuf->db_data;
 
-       dsphys->dd_creation_time = gethrestime_sec();
+       ddphys->dd_creation_time = gethrestime_sec();
        if (pds)
-               dsphys->dd_parent_obj = pds->dd_object;
-       dsphys->dd_props_zapobj = zap_create(mos,
+               ddphys->dd_parent_obj = pds->dd_object;
+       ddphys->dd_props_zapobj = zap_create(mos,
            DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
-       dsphys->dd_child_dir_zapobj = zap_create(mos,
+       ddphys->dd_child_dir_zapobj = zap_create(mos,
            DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
        if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
-               dsphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
+               ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
        dmu_buf_rele(dbuf, FTAG);
 
        return (ddobj);
@@ -427,7 +447,8 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
 int
 dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
-       dsl_dir_t *dd = arg1;
+       dsl_dataset_t *ds = arg1;
+       dsl_dir_t *dd = ds->ds_dir;
        dsl_pool_t *dp = dd->dd_pool;
        objset_t *mos = dp->dp_meta_objset;
        int err;
@@ -454,19 +475,27 @@ dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
 }
 
 void
-dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
 {
-       dsl_dir_t *dd = arg1;
+       dsl_dataset_t *ds = arg1;
+       dsl_dir_t *dd = ds->ds_dir;
        objset_t *mos = dd->dd_pool->dp_meta_objset;
-       uint64_t val, obj;
+       dsl_prop_setarg_t psa;
+       uint64_t value = 0;
+       uint64_t obj;
        dd_used_t t;
 
        ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock));
        ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
 
        /* Remove our reservation. */
-       val = 0;
-       dsl_dir_set_reservation_sync(dd, &val, cr, tx);
+       dsl_prop_setarg_init_uint64(&psa, "reservation",
+           (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
+           &value);
+       psa.psa_effective_value = 0;    /* predict default value */
+
+       dsl_dir_set_reservation_sync(ds, &psa, tx);
+
        ASSERT3U(dd->dd_phys->dd_used_bytes, ==, 0);
        ASSERT3U(dd->dd_phys->dd_reserved, ==, 0);
        for (t = 0; t < DD_USED_NUM; t++)
@@ -640,15 +669,6 @@ dsl_dir_space_available(dsl_dir_t *dd,
        if (used > quota) {
                /* over quota */
                myspace = 0;
-
-               /*
-                * While it's OK to be a little over quota, if
-                * we think we are using more space than there
-                * is in the pool (which is already 1.6% more than
-                * dsl_pool_adjustedsize()), something is very
-                * wrong.
-                */
-               ASSERT3U(used, <=, spa_get_space(dd->dd_pool->dp_spa));
        } else {
                /*
                 * the lesser of the space provided by our parent and
@@ -676,8 +696,9 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
 {
        uint64_t txg = tx->tx_txg;
        uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
+       uint64_t deferred = 0;
        struct tempreserve *tr;
-       int enospc = EDQUOT;
+       int retval = EDQUOT;
        int txgidx = txg & TXG_MASK;
        int i;
        uint64_t ref_rsrv = 0;
@@ -703,7 +724,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
         */
        if (first && tx->tx_objset) {
                int error;
-               dsl_dataset_t *ds = tx->tx_objset->os->os_dsl_dataset;
+               dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;
 
                error = dsl_dataset_check_quota(ds, checkrefquota,
                    asize, est_inflight, &used_on_disk, &ref_rsrv);
@@ -723,7 +744,8 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
                quota = dd->dd_phys->dd_quota;
 
        /*
-        * Adjust the quota against the actual pool size at the root.
+        * Adjust the quota against the actual pool size at the root
+        * minus any outstanding deferred frees.
         * To ensure that it's possible to remove files from a full
         * pool without inducing transient overcommits, we throttle
         * netfree transactions against a quota that is slightly larger,
@@ -732,10 +754,12 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
         * removes to get through.
         */
        if (dd->dd_parent == NULL) {
+               spa_t *spa = dd->dd_pool->dp_spa;
                uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
-               if (poolsize < quota) {
-                       quota = poolsize;
-                       enospc = ENOSPC;
+               deferred = metaslab_class_get_deferred(spa_normal_class(spa));
+               if (poolsize - deferred < quota) {
+                       quota = poolsize - deferred;
+                       retval = ENOSPC;
                }
        }
 
@@ -745,15 +769,16 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
         * on-disk is over quota and there are no pending changes (which
         * may free up space for us).
         */
-       if (used_on_disk + est_inflight > quota) {
-               if (est_inflight > 0 || used_on_disk < quota)
-                       enospc = ERESTART;
+       if (used_on_disk + est_inflight >= quota) {
+               if (est_inflight > 0 || used_on_disk < quota ||
+                   (retval == ENOSPC && used_on_disk < quota + deferred))
+                       retval = ERESTART;
                dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
                    "quota=%lluK tr=%lluK err=%d\n",
                    used_on_disk>>10, est_inflight>>10,
-                   quota>>10, asize>>10, enospc);
+                   quota>>10, asize>>10, retval);
                mutex_exit(&dd->dd_lock);
-               return (enospc);
+               return (retval);
        }
 
        /* We need to up our estimated delta before dropping dd_lock */
@@ -987,13 +1012,16 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
 static int
 dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
-       dsl_dir_t *dd = arg1;
-       uint64_t *quotap = arg2;
-       uint64_t new_quota = *quotap;
-       int err = 0;
+       dsl_dataset_t *ds = arg1;
+       dsl_dir_t *dd = ds->ds_dir;
+       dsl_prop_setarg_t *psa = arg2;
+       int err;
        uint64_t towrite;
 
-       if (new_quota == 0)
+       if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
+               return (err);
+
+       if (psa->psa_effective_value == 0)
                return (0);
 
        mutex_enter(&dd->dd_lock);
@@ -1005,64 +1033,88 @@ dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
         */
        towrite = dsl_dir_space_towrite(dd);
        if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
-           (new_quota < dd->dd_phys->dd_reserved ||
-           new_quota < dd->dd_phys->dd_used_bytes + towrite)) {
+           (psa->psa_effective_value < dd->dd_phys->dd_reserved ||
+           psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) {
                err = ENOSPC;
        }
        mutex_exit(&dd->dd_lock);
        return (err);
 }
 
-/* ARGSUSED */
+extern dsl_syncfunc_t dsl_prop_set_sync;
+
 static void
-dsl_dir_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
-       dsl_dir_t *dd = arg1;
-       uint64_t *quotap = arg2;
-       uint64_t new_quota = *quotap;
+       dsl_dataset_t *ds = arg1;
+       dsl_dir_t *dd = ds->ds_dir;
+       dsl_prop_setarg_t *psa = arg2;
+       uint64_t effective_value = psa->psa_effective_value;
+
+       dsl_prop_set_sync(ds, psa, tx);
+       DSL_PROP_CHECK_PREDICTION(dd, psa);
 
        dmu_buf_will_dirty(dd->dd_dbuf, tx);
 
        mutex_enter(&dd->dd_lock);
-       dd->dd_phys->dd_quota = new_quota;
+       dd->dd_phys->dd_quota = effective_value;
        mutex_exit(&dd->dd_lock);
 
-       spa_history_internal_log(LOG_DS_QUOTA, dd->dd_pool->dp_spa,
-           tx, cr, "%lld dataset = %llu ",
-           (longlong_t)new_quota, dd->dd_phys->dd_head_dataset_obj);
+       spa_history_log_internal(LOG_DS_QUOTA, dd->dd_pool->dp_spa,
+           tx, "%lld dataset = %llu ",
+           (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj);
 }
 
 int
-dsl_dir_set_quota(const char *ddname, uint64_t quota)
+dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
 {
        dsl_dir_t *dd;
+       dsl_dataset_t *ds;
+       dsl_prop_setarg_t psa;
        int err;
 
-       err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+       dsl_prop_setarg_init_uint64(&psa, "quota", source, &quota);
+
+       err = dsl_dataset_hold(ddname, FTAG, &ds);
        if (err)
                return (err);
 
-       if (quota != dd->dd_phys->dd_quota) {
-               /*
-                * If someone removes a file, then tries to set the quota, we
-                * want to make sure the file freeing takes effect.
-                */
-               txg_wait_open(dd->dd_pool, 0);
-
-               err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
-                   dsl_dir_set_quota_sync, dd, &quota, 0);
+       err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+       if (err) {
+               dsl_dataset_rele(ds, FTAG);
+               return (err);
        }
+
+       ASSERT(ds->ds_dir == dd);
+
+       /*
+        * If someone removes a file, then tries to set the quota, we want to
+        * make sure the file freeing takes effect.
+        */
+       txg_wait_open(dd->dd_pool, 0);
+
+       err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
+           dsl_dir_set_quota_sync, ds, &psa, 0);
+
        dsl_dir_close(dd, FTAG);
+       dsl_dataset_rele(ds, FTAG);
        return (err);
 }
 
 int
 dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
-       dsl_dir_t *dd = arg1;
-       uint64_t *reservationp = arg2;
-       uint64_t new_reservation = *reservationp;
+       dsl_dataset_t *ds = arg1;
+       dsl_dir_t *dd = ds->ds_dir;
+       dsl_prop_setarg_t *psa = arg2;
+       uint64_t effective_value;
        uint64_t used, avail;
+       int err;
+
+       if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
+               return (err);
+
+       effective_value = psa->psa_effective_value;
 
        /*
         * If we are doing the preliminary check in open context, the
@@ -1082,37 +1134,40 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
                avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
        }
 
-       if (MAX(used, new_reservation) > MAX(used, dd->dd_phys->dd_reserved)) {
-               uint64_t delta = MAX(used, new_reservation) -
+       if (MAX(used, effective_value) > MAX(used, dd->dd_phys->dd_reserved)) {
+               uint64_t delta = MAX(used, effective_value) -
                    MAX(used, dd->dd_phys->dd_reserved);
 
                if (delta > avail)
                        return (ENOSPC);
                if (dd->dd_phys->dd_quota > 0 &&
-                   new_reservation > dd->dd_phys->dd_quota)
+                   effective_value > dd->dd_phys->dd_quota)
                        return (ENOSPC);
        }
 
        return (0);
 }
 
-/* ARGSUSED */
 static void
-dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
-       dsl_dir_t *dd = arg1;
-       uint64_t *reservationp = arg2;
-       uint64_t new_reservation = *reservationp;
+       dsl_dataset_t *ds = arg1;
+       dsl_dir_t *dd = ds->ds_dir;
+       dsl_prop_setarg_t *psa = arg2;
+       uint64_t effective_value = psa->psa_effective_value;
        uint64_t used;
        int64_t delta;
 
+       dsl_prop_set_sync(ds, psa, tx);
+       DSL_PROP_CHECK_PREDICTION(dd, psa);
+
        dmu_buf_will_dirty(dd->dd_dbuf, tx);
 
        mutex_enter(&dd->dd_lock);
        used = dd->dd_phys->dd_used_bytes;
-       delta = MAX(used, new_reservation) -
+       delta = MAX(used, effective_value) -
            MAX(used, dd->dd_phys->dd_reserved);
-       dd->dd_phys->dd_reserved = new_reservation;
+       dd->dd_phys->dd_reserved = effective_value;
 
        if (dd->dd_parent != NULL) {
                /* Roll up this additional usage into our ancestors */
@@ -1121,23 +1176,39 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
        }
        mutex_exit(&dd->dd_lock);
 
-       spa_history_internal_log(LOG_DS_RESERVATION, dd->dd_pool->dp_spa,
-           tx, cr, "%lld dataset = %llu",
-           (longlong_t)new_reservation, dd->dd_phys->dd_head_dataset_obj);
+       spa_history_log_internal(LOG_DS_RESERVATION, dd->dd_pool->dp_spa,
+           tx, "%lld dataset = %llu",
+           (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj);
 }
 
 int
-dsl_dir_set_reservation(const char *ddname, uint64_t reservation)
+dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
+    uint64_t reservation)
 {
        dsl_dir_t *dd;
+       dsl_dataset_t *ds;
+       dsl_prop_setarg_t psa;
        int err;
 
-       err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+       dsl_prop_setarg_init_uint64(&psa, "reservation", source, &reservation);
+
+       err = dsl_dataset_hold(ddname, FTAG, &ds);
        if (err)
                return (err);
+
+       err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+       if (err) {
+               dsl_dataset_rele(ds, FTAG);
+               return (err);
+       }
+
+       ASSERT(ds->ds_dir == dd);
+
        err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check,
-           dsl_dir_set_reservation_sync, dd, &reservation, 0);
+           dsl_dir_set_reservation_sync, ds, &psa, 0);
+
        dsl_dir_close(dd, FTAG);
+       dsl_dataset_rele(ds, FTAG);
        return (err);
 }
 
@@ -1175,7 +1246,6 @@ struct renamearg {
        const char *mynewname;
 };
 
-/*ARGSUSED*/
 static int
 dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
@@ -1186,8 +1256,14 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
        int err;
        uint64_t val;
 
-       /* There should be 2 references: the open and the dirty */
-       if (dmu_buf_refcount(dd->dd_dbuf) > 2)
+       /*
+        * There should only be one reference, from dmu_objset_rename().
+        * Fleeting holds are also possible (eg, from "zfs list" getting
+        * stats), but any that are present in open context will likely
+        * be gone by syncing context, so only fail from syncing
+        * context.
+        */
+       if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 1)
                return (EBUSY);
 
        /* check for existing name */
@@ -1216,7 +1292,7 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
 }
 
 static void
-dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dir_t *dd = arg1;
        struct renamearg *ra = arg2;
@@ -1265,8 +1341,8 @@ dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
            dd->dd_myname, 8, 1, &dd->dd_object, tx);
        ASSERT3U(err, ==, 0);
 
-       spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa,
-           tx, cr, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj);
+       spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa,
+           tx, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj);
 }
 
 int
@@ -1315,3 +1391,26 @@ dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
 
        return (0);
 }
+
+timestruc_t
+dsl_dir_snap_cmtime(dsl_dir_t *dd)
+{
+       timestruc_t t;
+
+       mutex_enter(&dd->dd_lock);
+       t = dd->dd_snap_cmtime;
+       mutex_exit(&dd->dd_lock);
+
+       return (t);
+}
+
+void
+dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
+{
+       timestruc_t t;
+
+       gethrestime(&t);
+       mutex_enter(&dd->dd_lock);
+       dd->dd_snap_cmtime = t;
+       mutex_exit(&dd->dd_lock);
+}
index 2c5dfca..2cd21a1 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_synctask.h>
+#include <sys/dsl_scan.h>
+#include <sys/dnode.h>
 #include <sys/dmu_tx.h>
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/spa_impl.h>
+#include <sys/dsl_deadlist.h>
 
 int zfs_no_write_throttle = 0;
 int zfs_write_limit_shift = 3;                 /* 1/8th of physical memory */
-int zfs_txg_synctime = 5;                      /* target secs to sync a txg */
+int zfs_txg_synctime_ms = 5000;                /* target millisecs to sync a txg */
 
 uint64_t zfs_write_limit_min = 32 << 20;       /* min write limit is 32MB */
 uint64_t zfs_write_limit_max = 0;              /* max data payload per txg */
@@ -50,7 +53,7 @@ kmutex_t zfs_write_limit_lock;
 
 static pgcnt_t old_physmem = 0;
 
-static int
+int
 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
 {
        uint64_t obj;
@@ -88,7 +91,6 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
            offsetof(dsl_dataset_t, ds_synced_link));
 
        mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
-       mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL);
 
        dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
            1, 4, 0);
@@ -103,13 +105,13 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
        dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
        dsl_dir_t *dd;
        dsl_dataset_t *ds;
-       objset_impl_t *osi;
+       uint64_t obj;
 
        rw_enter(&dp->dp_config_rwlock, RW_WRITER);
-       err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi);
+       err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
+           &dp->dp_meta_objset);
        if (err)
                goto out;
-       dp->dp_meta_objset = &osi->os;
 
        err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
            DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
@@ -143,53 +145,30 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
                        goto out;
        }
 
-       /* get scrub status */
-       err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-           DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
-           &dp->dp_scrub_func);
-       if (err == 0) {
-               err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-                   DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
-                   &dp->dp_scrub_queue_obj);
-               if (err)
-                       goto out;
-               err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-                   DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
-                   &dp->dp_scrub_min_txg);
-               if (err)
-                       goto out;
-               err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-                   DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
-                   &dp->dp_scrub_max_txg);
-               if (err)
-                       goto out;
-               err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-                   DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
-                   &dp->dp_scrub_bookmark);
+       if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+               err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
+                   &dp->dp_free_dir);
                if (err)
                        goto out;
+
                err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-                   DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
-                   &spa->spa_scrub_errors);
+                   DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
                if (err)
                        goto out;
-               if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
-                       /*
-                        * A new-type scrub was in progress on an old
-                        * pool.  Restart from the beginning, since the
-                        * old software may have changed the pool in the
-                        * meantime.
-                        */
-                       dsl_pool_scrub_restart(dp);
-               }
-       } else {
-               /*
-                * It's OK if there is no scrub in progress (and if
-                * there was an I/O error, ignore it).
-                */
-               err = 0;
+               VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
+                   dp->dp_meta_objset, obj));
        }
 
+       err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+           DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
+           &dp->dp_tmp_userrefs_obj);
+       if (err == ENOENT)
+               err = 0;
+       if (err)
+               goto out;
+
+       err = dsl_scan_init(dp, txg);
+
 out:
        rw_exit(&dp->dp_config_rwlock);
        if (err)
@@ -214,22 +193,27 @@ dsl_pool_close(dsl_pool_t *dp)
                dsl_dataset_drop_ref(dp->dp_origin_snap, dp);
        if (dp->dp_mos_dir)
                dsl_dir_close(dp->dp_mos_dir, dp);
+       if (dp->dp_free_dir)
+               dsl_dir_close(dp->dp_free_dir, dp);
        if (dp->dp_root_dir)
                dsl_dir_close(dp->dp_root_dir, dp);
 
+       bpobj_close(&dp->dp_free_bpobj);
+
        /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
        if (dp->dp_meta_objset)
-               dmu_objset_evict(NULL, dp->dp_meta_objset->os);
+               dmu_objset_evict(dp->dp_meta_objset);
 
        txg_list_destroy(&dp->dp_dirty_datasets);
+       txg_list_destroy(&dp->dp_sync_tasks);
        txg_list_destroy(&dp->dp_dirty_dirs);
        list_destroy(&dp->dp_synced_datasets);
 
        arc_flush(dp->dp_spa);
        txg_fini(dp);
+       dsl_scan_fini(dp);
        rw_destroy(&dp->dp_config_rwlock);
        mutex_destroy(&dp->dp_lock);
-       mutex_destroy(&dp->dp_scrub_cancel_lock);
        taskq_destroy(dp->dp_vnrele_taskq);
        if (dp->dp_blkstats)
                kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
@@ -242,19 +226,22 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
        int err;
        dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
        dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
-       objset_impl_t *osip;
+       objset_t *os;
        dsl_dataset_t *ds;
-       uint64_t dsobj;
+       uint64_t obj;
 
        /* create and open the MOS (meta-objset) */
-       dp->dp_meta_objset = &dmu_objset_create_impl(spa,
-           NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os;
+       dp->dp_meta_objset = dmu_objset_create_impl(spa,
+           NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
 
        /* create the pool directory */
        err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
            DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
        ASSERT3U(err, ==, 0);
 
+       /* Initialize scan structures */
+       VERIFY3U(0, ==, dsl_scan_init(dp, txg));
+
        /* create and open the root dir */
        dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
        VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
@@ -265,18 +252,33 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
        VERIFY(0 == dsl_pool_open_special_dir(dp,
            MOS_DIR_NAME, &dp->dp_mos_dir));
 
+       if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+               /* create and open the free dir */
+               (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
+                   FREE_DIR_NAME, tx);
+               VERIFY(0 == dsl_pool_open_special_dir(dp,
+                   FREE_DIR_NAME, &dp->dp_free_dir));
+
+               /* create and open the free_bplist */
+               obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
+               VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+                   DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
+               VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
+                   dp->dp_meta_objset, obj));
+       }
+
        if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
                dsl_pool_create_origin(dp, tx);
 
        /* create the root dataset */
-       dsobj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
+       obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
 
        /* create the root objset */
-       VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
-       osip = dmu_objset_create_impl(dp->dp_spa, ds,
+       VERIFY(0 == dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
+       os = dmu_objset_create_impl(dp->dp_spa, ds,
            dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
 #ifdef _KERNEL
-       zfs_create_fs(&osip->os, kcred, zplprops, tx);
+       zfs_create_fs(os, kcred, zplprops, tx);
 #endif
        dsl_dataset_rele(ds, FTAG);
 
@@ -285,6 +287,14 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
        return (dp);
 }
 
+static int
+deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+       dsl_deadlist_t *dl = arg;
+       dsl_deadlist_insert(dl, bp, tx);
+       return (0);
+}
+
 void
 dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 {
@@ -293,11 +303,19 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
        dsl_dir_t *dd;
        dsl_dataset_t *ds;
        dsl_sync_task_group_t *dstg;
-       objset_impl_t *mosi = dp->dp_meta_objset->os;
+       objset_t *mos = dp->dp_meta_objset;
        hrtime_t start, write_time;
        uint64_t data_written;
        int err;
 
+       /*
+        * We need to copy dp_space_towrite() before doing
+        * dsl_sync_task_group_sync(), because
+        * dsl_dataset_snapshot_reserve_space() will increase
+        * dp_space_towrite but not actually write anything.
+        */
+       data_written = dp->dp_space_towrite[txg & TXG_MASK];
+
        tx = dmu_tx_create_assigned(dp, txg);
 
        dp->dp_read_overhead = 0;
@@ -323,11 +341,11 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 
        for (ds = list_head(&dp->dp_synced_datasets); ds;
            ds = list_next(&dp->dp_synced_datasets, ds))
-               dmu_objset_do_userquota_callbacks(ds->ds_user_ptr, tx);
+               dmu_objset_do_userquota_updates(ds->ds_objset, tx);
 
        /*
         * Sync the datasets again to push out the changes due to
-        * userquota updates.  This must be done before we process the
+        * userspace updates.  This must be done before we process the
         * sync tasks, because that could cause a snapshot of a dataset
         * whose ds_bp will be rewritten when we do this 2nd sync.
         */
@@ -339,6 +357,16 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
        }
        err = zio_wait(zio);
 
+       /*
+        * Move dead blocks from the pending deadlist to the on-disk
+        * deadlist.
+        */
+       for (ds = list_head(&dp->dp_synced_datasets); ds;
+           ds = list_next(&dp->dp_synced_datasets, ds)) {
+               bplist_iterate(&ds->ds_pending_deadlist,
+                   deadlist_enqueue_cb, &ds->ds_deadlist, tx);
+       }
+
        while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) {
                /*
                 * No more sync tasks should have been added while we
@@ -354,14 +382,11 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
                dsl_dir_sync(dd, tx);
        write_time += gethrtime() - start;
 
-       if (spa_sync_pass(dp->dp_spa) == 1)
-               dsl_pool_scrub_sync(dp, tx);
-
        start = gethrtime();
-       if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
-           list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) {
+       if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
+           list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
                zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-               dmu_objset_sync(mosi, zio, tx);
+               dmu_objset_sync(mos, zio, tx);
                err = zio_wait(zio);
                ASSERT(err == 0);
                dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
@@ -374,7 +399,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 
        dmu_tx_commit(tx);
 
-       data_written = dp->dp_space_towrite[txg & TXG_MASK];
        dp->dp_space_towrite[txg & TXG_MASK] = 0;
        ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);
 
@@ -399,10 +423,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
         * amount of write traffic allowed into each transaction group.
         * Weight the throughput calculation towards the current value:
         *      thru = 3/4 old_thru + 1/4 new_thru
+        *
+        * Note: write_time is in nanosecs, so write_time/MICROSEC
+        * yields millisecs
         */
        ASSERT(zfs_write_limit_min > 0);
-       if (data_written > zfs_write_limit_min / 8 && write_time > 0) {
-               uint64_t throughput = (data_written * NANOSEC) / write_time;
+       if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) {
+               uint64_t throughput = data_written / (write_time / MICROSEC);
+
                if (dp->dp_throughput)
                        dp->dp_throughput = throughput / 4 +
                            3 * dp->dp_throughput / 4;
@@ -410,21 +438,24 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
                        dp->dp_throughput = throughput;
                dp->dp_write_limit = MIN(zfs_write_limit_inflated,
                    MAX(zfs_write_limit_min,
-                   dp->dp_throughput * zfs_txg_synctime));
+                   dp->dp_throughput * zfs_txg_synctime_ms));
        }
 }
 
 void
-dsl_pool_zil_clean(dsl_pool_t *dp)
+dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 {
        dsl_dataset_t *ds;
+       objset_t *os;
 
        while (ds = list_head(&dp->dp_synced_datasets)) {
                list_remove(&dp->dp_synced_datasets, ds);
-               ASSERT(ds->ds_user_ptr != NULL);
-               zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil);
+               os = ds->ds_objset;
+               zil_clean(os->os_zil);
+               ASSERT(!dmu_objset_is_dirty(os, txg));
                dmu_buf_rele(ds->ds_dbuf, ds);
        }
+       ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 }
 
 /*
@@ -601,6 +632,7 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
        ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object);
 
        if (prev->ds_phys->ds_next_clones_obj == 0) {
+               dmu_buf_will_dirty(prev->ds_dbuf, tx);
                prev->ds_phys->ds_next_clones_obj =
                    zap_create(dp->dp_meta_objset,
                    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
@@ -620,8 +652,67 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
        ASSERT(dmu_tx_is_syncing(tx));
        ASSERT(dp->dp_origin_snap != NULL);
 
-       (void) dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb,
-           tx, DS_FIND_CHILDREN);
+       VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb,
+           tx, DS_FIND_CHILDREN));
+}
+
+/* ARGSUSED */
+static int
+upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+       dmu_tx_t *tx = arg;
+       dsl_dataset_t *ds;
+       dsl_pool_t *dp = spa_get_dsl(spa);
+       objset_t *mos = dp->dp_meta_objset;
+
+       VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+
+       if (ds->ds_dir->dd_phys->dd_origin_obj) {
+               dsl_dataset_t *origin;
+
+               VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+                   ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));
+
+               if (origin->ds_dir->dd_phys->dd_clones == 0) {
+                       dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
+                       origin->ds_dir->dd_phys->dd_clones = zap_create(mos,
+                           DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+               }
+
+               VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
+                   origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
+
+               dsl_dataset_rele(origin, FTAG);
+       }
+
+       dsl_dataset_rele(ds, FTAG);
+       return (0);
+}
+
+void
+dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+       ASSERT(dmu_tx_is_syncing(tx));
+       uint64_t obj;
+
+       (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
+       VERIFY(0 == dsl_pool_open_special_dir(dp,
+           FREE_DIR_NAME, &dp->dp_free_dir));
+
+       /*
+        * We can't use bpobj_alloc(), because spa_version() still
+        * returns the old version, and we need a new-version bpobj with
+        * subobj support.  So call dmu_object_alloc() directly.
+        */
+       obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
+           SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
+       VERIFY3U(0, ==, zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+           DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
+       VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
+           dp->dp_meta_objset, obj));
+
+       VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL,
+           upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
 }
 
 void
@@ -638,7 +729,7 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
        dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
            NULL, 0, kcred, tx);
        VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
-       dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, kcred, tx);
+       dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, tx);
        VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
            dp, &dp->dp_origin_snap));
        dsl_dataset_rele(ds, FTAG);
@@ -650,3 +741,108 @@ dsl_pool_vnrele_taskq(dsl_pool_t *dp)
 {
        return (dp->dp_vnrele_taskq);
 }
+
+/*
+ * Walk through the pool-wide zap object of temporary snapshot user holds
+ * and release them.
+ */
+void
+dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
+{
+       zap_attribute_t za;
+       zap_cursor_t zc;
+       objset_t *mos = dp->dp_meta_objset;
+       uint64_t zapobj = dp->dp_tmp_userrefs_obj;
+
+       if (zapobj == 0)
+               return;
+       ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+
+       for (zap_cursor_init(&zc, mos, zapobj);
+           zap_cursor_retrieve(&zc, &za) == 0;
+           zap_cursor_advance(&zc)) {
+               char *htag;
+               uint64_t dsobj;
+
+               htag = strchr(za.za_name, '-');
+               *htag = '\0';
+               ++htag;
+               dsobj = strtonum(za.za_name, NULL);
+               (void) dsl_dataset_user_release_tmp(dp, dsobj, htag);
+       }
+       zap_cursor_fini(&zc);
+}
+
+/*
+ * Create the pool-wide zap object for storing temporary snapshot holds.
+ */
+void
+dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+       objset_t *mos = dp->dp_meta_objset;
+
+       ASSERT(dp->dp_tmp_userrefs_obj == 0);
+       ASSERT(dmu_tx_is_syncing(tx));
+
+       dp->dp_tmp_userrefs_obj = zap_create(mos, DMU_OT_USERREFS,
+           DMU_OT_NONE, 0, tx);
+
+       VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS,
+           sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj, tx) == 0);
+}
+
+static int
+dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
+    const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding)
+{
+       objset_t *mos = dp->dp_meta_objset;
+       uint64_t zapobj = dp->dp_tmp_userrefs_obj;
+       char *name;
+       int error;
+
+       ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+       ASSERT(dmu_tx_is_syncing(tx));
+
+       /*
+        * If the pool was created prior to SPA_VERSION_USERREFS, the
+        * zap object for temporary holds might not exist yet.
+        */
+       if (zapobj == 0) {
+               if (holding) {
+                       dsl_pool_user_hold_create_obj(dp, tx);
+                       zapobj = dp->dp_tmp_userrefs_obj;
+               } else {
+                       return (ENOENT);
+               }
+       }
+
+       name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
+       if (holding)
+               error = zap_add(mos, zapobj, name, 8, 1, now, tx);
+       else
+               error = zap_remove(mos, zapobj, name, tx);
+       strfree(name);
+
+       return (error);
+}
+
+/*
+ * Add a temporary hold for the given dataset object and tag.
+ */
+int
+dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
+    uint64_t *now, dmu_tx_t *tx)
+{
+       return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
+}
+
+/*
+ * Release a temporary hold for the given dataset object and tag.
+ */
+int
+dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
+    dmu_tx_t *tx)
+{
+       return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL,
+           tx, B_FALSE));
+}
index bfc0fa8..aa66b32 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
+#include <sys/zfs_context.h>
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/spa.h>
-#include <sys/zio_checksum.h> /* for the default checksum value */
 #include <sys/zap.h>
 #include <sys/fs/zfs.h>
 
 #include "zfs_prop.h"
 
+#define        ZPROP_INHERIT_SUFFIX "$inherit"
+#define        ZPROP_RECVD_SUFFIX "$recvd"
+
 static int
-dodefault(const char *propname, int intsz, int numint, void *buf)
+dodefault(const char *propname, int intsz, int numints, void *buf)
 {
        zfs_prop_t prop;
 
@@ -55,9 +57,9 @@ dodefault(const char *propname, int intsz, int numint, void *buf)
                if (intsz != 1)
                        return (EOVERFLOW);
                (void) strncpy(buf, zfs_prop_default_string(prop),
-                   numint);
+                   numints);
        } else {
-               if (intsz != 8 || numint < 1)
+               if (intsz != 8 || numints < 1)
                        return (EOVERFLOW);
 
                *(uint64_t *)buf = zfs_prop_default_numeric(prop);
@@ -68,11 +70,16 @@ dodefault(const char *propname, int intsz, int numint, void *buf)
 
 int
 dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
-    int intsz, int numint, void *buf, char *setpoint)
+    int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot)
 {
        int err = ENOENT;
+       dsl_dir_t *target = dd;
        objset_t *mos = dd->dd_pool->dp_meta_objset;
        zfs_prop_t prop;
+       boolean_t inheritable;
+       boolean_t inheriting = B_FALSE;
+       char *inheritstr;
+       char *recvdstr;
 
        ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
 
@@ -80,51 +87,135 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
                setpoint[0] = '\0';
 
        prop = zfs_name_to_prop(propname);
+       inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
+       inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX);
+       recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
 
        /*
-        * Note: dd may be NULL, therefore we shouldn't dereference it
-        * ouside this loop.
+        * Note: dd may become NULL, therefore we shouldn't dereference it
+        * after this loop.
         */
        for (; dd != NULL; dd = dd->dd_parent) {
                ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
-               err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
-                   propname, intsz, numint, buf);
+
+               if (dd != target || snapshot) {
+                       if (!inheritable)
+                               break;
+                       inheriting = B_TRUE;
+               }
+
+               /* Check for a local value. */
+               err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname,
+                   intsz, numints, buf);
                if (err != ENOENT) {
-                       if (setpoint)
+                       if (setpoint != NULL && err == 0)
                                dsl_dir_name(dd, setpoint);
                        break;
                }
 
                /*
-                * Break out of this loop for non-inheritable properties.
+                * Skip the check for a received value if there is an explicit
+                * inheritance entry.
                 */
-               if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop))
+               err = zap_contains(mos, dd->dd_phys->dd_props_zapobj,
+                   inheritstr);
+               if (err != 0 && err != ENOENT)
                        break;
+
+               if (err == ENOENT) {
+                       /* Check for a received value. */
+                       err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
+                           recvdstr, intsz, numints, buf);
+                       if (err != ENOENT) {
+                               if (setpoint != NULL && err == 0) {
+                                       if (inheriting) {
+                                               dsl_dir_name(dd, setpoint);
+                                       } else {
+                                               (void) strcpy(setpoint,
+                                                   ZPROP_SOURCE_VAL_RECVD);
+                                       }
+                               }
+                               break;
+                       }
+               }
+
+               /*
+                * If we found an explicit inheritance entry, err is zero even
+                * though we haven't yet found the value, so reinitializing err
+                * at the end of the loop (instead of at the beginning) ensures
+                * that err has a valid post-loop value.
+                */
+               err = ENOENT;
        }
+
        if (err == ENOENT)
-               err = dodefault(propname, intsz, numint, buf);
+               err = dodefault(propname, intsz, numints, buf);
+
+       strfree(inheritstr);
+       strfree(recvdstr);
 
        return (err);
 }
 
 int
 dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname,
-    int intsz, int numint, void *buf, char *setpoint)
+    int intsz, int numints, void *buf, char *setpoint)
 {
+       zfs_prop_t prop = zfs_name_to_prop(propname);
+       boolean_t inheritable;
+       boolean_t snapshot;
+       uint64_t zapobj;
+
        ASSERT(RW_LOCK_HELD(&ds->ds_dir->dd_pool->dp_config_rwlock));
+       inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
+       snapshot = (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds));
+       zapobj = (ds->ds_phys == NULL ? 0 : ds->ds_phys->ds_props_obj);
+
+       if (zapobj != 0) {
+               objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+               int err;
 
-       if (ds->ds_phys->ds_props_obj) {
-               int err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
-                   ds->ds_phys->ds_props_obj, propname, intsz, numint, buf);
+               ASSERT(snapshot);
+
+               /* Check for a local value. */
+               err = zap_lookup(mos, zapobj, propname, intsz, numints, buf);
                if (err != ENOENT) {
-                       if (setpoint)
+                       if (setpoint != NULL && err == 0)
                                dsl_dataset_name(ds, setpoint);
                        return (err);
                }
+
+               /*
+                * Skip the check for a received value if there is an explicit
+                * inheritance entry.
+                */
+               if (inheritable) {
+                       char *inheritstr = kmem_asprintf("%s%s", propname,
+                           ZPROP_INHERIT_SUFFIX);
+                       err = zap_contains(mos, zapobj, inheritstr);
+                       strfree(inheritstr);
+                       if (err != 0 && err != ENOENT)
+                               return (err);
+               }
+
+               if (err == ENOENT) {
+                       /* Check for a received value. */
+                       char *recvdstr = kmem_asprintf("%s%s", propname,
+                           ZPROP_RECVD_SUFFIX);
+                       err = zap_lookup(mos, zapobj, recvdstr,
+                           intsz, numints, buf);
+                       strfree(recvdstr);
+                       if (err != ENOENT) {
+                               if (setpoint != NULL && err == 0)
+                                       (void) strcpy(setpoint,
+                                           ZPROP_SOURCE_VAL_RECVD);
+                               return (err);
+                       }
+               }
        }
 
        return (dsl_prop_get_dd(ds->ds_dir, propname,
-           intsz, numint, buf, setpoint));
+           intsz, numints, buf, setpoint, snapshot));
 }
 
 /*
@@ -168,11 +259,8 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname,
 
        cbr->cbr_func(cbr->cbr_arg, value);
 
-       VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
-           NULL, cbr, &dd));
        if (need_rwlock)
                rw_exit(&dp->dp_config_rwlock);
-       /* Leave dir open until this callback is unregistered */
        return (0);
 }
 
@@ -210,6 +298,137 @@ dsl_prop_get_integer(const char *ddname, const char *propname,
        return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint));
 }
 
+void
+dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname,
+    zprop_source_t source, uint64_t *value)
+{
+       psa->psa_name = propname;
+       psa->psa_source = source;
+       psa->psa_intsz = 8;
+       psa->psa_numints = 1;
+       psa->psa_value = value;
+
+       psa->psa_effective_value = -1ULL;
+}
+
+/*
+ * Predict the effective value of the given special property if it were set with
+ * the given value and source. This is not a general purpose function. It exists
+ * only to handle the special requirements of the quota and reservation
+ * properties. The fact that these properties are non-inheritable greatly
+ * simplifies the prediction logic.
+ *
+ * Returns 0 on success, a positive error code on failure, or -1 if called with
+ * a property not handled by this function.
+ */
+int
+dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa)
+{
+       const char *propname = psa->psa_name;
+       zfs_prop_t prop = zfs_name_to_prop(propname);
+       zprop_source_t source = psa->psa_source;
+       objset_t *mos;
+       uint64_t zapobj;
+       uint64_t version;
+       char *recvdstr;
+       int err = 0;
+
+       switch (prop) {
+       case ZFS_PROP_QUOTA:
+       case ZFS_PROP_RESERVATION:
+       case ZFS_PROP_REFQUOTA:
+       case ZFS_PROP_REFRESERVATION:
+               break;
+       default:
+               return (-1);
+       }
+
+       mos = dd->dd_pool->dp_meta_objset;
+       zapobj = dd->dd_phys->dd_props_zapobj;
+       recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
+
+       version = spa_version(dd->dd_pool->dp_spa);
+       if (version < SPA_VERSION_RECVD_PROPS) {
+               if (source & ZPROP_SRC_NONE)
+                       source = ZPROP_SRC_NONE;
+               else if (source & ZPROP_SRC_RECEIVED)
+                       source = ZPROP_SRC_LOCAL;
+       }
+
+       switch (source) {
+       case ZPROP_SRC_NONE:
+               /* Revert to the received value, if any. */
+               err = zap_lookup(mos, zapobj, recvdstr, 8, 1,
+                   &psa->psa_effective_value);
+               if (err == ENOENT)
+                       psa->psa_effective_value = 0;
+               break;
+       case ZPROP_SRC_LOCAL:
+               psa->psa_effective_value = *(uint64_t *)psa->psa_value;
+               break;
+       case ZPROP_SRC_RECEIVED:
+               /*
+                * If there's no local setting, then the new received value will
+                * be the effective value.
+                */
+               err = zap_lookup(mos, zapobj, propname, 8, 1,
+                   &psa->psa_effective_value);
+               if (err == ENOENT)
+                       psa->psa_effective_value = *(uint64_t *)psa->psa_value;
+               break;
+       case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED):
+               /*
+                * We're clearing the received value, so the local setting (if
+                * it exists) remains the effective value.
+                */
+               err = zap_lookup(mos, zapobj, propname, 8, 1,
+                   &psa->psa_effective_value);
+               if (err == ENOENT)
+                       psa->psa_effective_value = 0;
+               break;
+       default:
+               cmn_err(CE_PANIC, "unexpected property source: %d", source);
+       }
+
+       strfree(recvdstr);
+
+       if (err == ENOENT)
+               return (0);
+
+       return (err);
+}
+
+#ifdef ZFS_DEBUG
+void
+dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa)
+{
+       zfs_prop_t prop = zfs_name_to_prop(psa->psa_name);
+       uint64_t intval;
+       char setpoint[MAXNAMELEN];
+       uint64_t version = spa_version(dd->dd_pool->dp_spa);
+       int err;
+
+       if (version < SPA_VERSION_RECVD_PROPS) {
+               switch (prop) {
+               case ZFS_PROP_QUOTA:
+               case ZFS_PROP_RESERVATION:
+                       return;
+               }
+       }
+
+       err = dsl_prop_get_dd(dd, psa->psa_name, 8, 1, &intval,
+           setpoint, B_FALSE);
+       if (err == 0 && intval != psa->psa_effective_value) {
+               cmn_err(CE_PANIC, "%s property, source: %x, "
+                   "predicted effective value: %llu, "
+                   "actual effective value: %llu (setpoint: %s)",
+                   psa->psa_name, psa->psa_source,
+                   (unsigned long long)psa->psa_effective_value,
+                   (unsigned long long)intval, setpoint);
+       }
+}
+#endif
+
 /*
  * Unregister this callback.  Return 0 on success, ENOENT if ddname is
  * invalid, ENOMSG if no matching callback registered.
@@ -241,8 +460,6 @@ dsl_prop_unregister(dsl_dataset_t *ds, const char *propname,
        kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1);
        kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
 
-       /* Clean up from dsl_prop_register */
-       dsl_dir_close(dd, cbr);
        return (0);
 }
 
@@ -277,7 +494,6 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
        zap_cursor_t zc;
        zap_attribute_t *za;
        int err;
-       uint64_t dummyval;
 
        ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
        err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd);
@@ -289,8 +505,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
                 * If the prop is set here, then this change is not
                 * being inherited here or below; stop the recursion.
                 */
-               err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname,
-                   8, 1, &dummyval);
+               err = zap_contains(mos, dd->dd_phys->dd_props_zapobj, propname);
                if (err == 0) {
                        dsl_dir_close(dd, FTAG);
                        return;
@@ -310,8 +525,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
                 * If the property is set on this ds, then it is not
                 * inherited here; don't call the callback.
                 */
-               if (propobj && 0 == zap_lookup(mos, propobj, propname,
-                   8, 1, &dummyval))
+               if (propobj && 0 == zap_contains(mos, propobj, propname))
                        continue;
 
                cbr->cbr_func(cbr->cbr_arg, value);
@@ -331,30 +545,28 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
        dsl_dir_close(dd, FTAG);
 }
 
-struct prop_set_arg {
-       const char *name;
-       int intsz;
-       int numints;
-       const void *buf;
-};
-
-
-static void
-dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+void
+dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dataset_t *ds = arg1;
-       struct prop_set_arg *psa = arg2;
+       dsl_prop_setarg_t *psa = arg2;
        objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-       uint64_t zapobj, intval;
+       uint64_t zapobj, intval, dummy;
        int isint;
        char valbuf[32];
-       char *valstr;
+       char *valstr = NULL;
+       char *inheritstr;
+       char *recvdstr;
+       char *tbuf = NULL;
+       int err;
+       uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa);
+       const char *propname = psa->psa_name;
+       zprop_source_t source = psa->psa_source;
 
-       isint = (dodefault(psa->name, 8, 1, &intval) == 0);
+       isint = (dodefault(propname, 8, 1, &intval) == 0);
 
-       if (dsl_dataset_is_snapshot(ds)) {
-               ASSERT(spa_version(ds->ds_dir->dd_pool->dp_spa) >=
-                   SPA_VERSION_SNAP_PROPS);
+       if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) {
+               ASSERT(version >= SPA_VERSION_SNAP_PROPS);
                if (ds->ds_phys->ds_props_obj == 0) {
                        dmu_buf_will_dirty(ds->ds_dbuf, tx);
                        ds->ds_phys->ds_props_obj =
@@ -366,22 +578,97 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
                zapobj = ds->ds_dir->dd_phys->dd_props_zapobj;
        }
 
-       if (psa->numints == 0) {
-               int err = zap_remove(mos, zapobj, psa->name, tx);
+       if (version < SPA_VERSION_RECVD_PROPS) {
+               zfs_prop_t prop = zfs_name_to_prop(propname);
+               if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION)
+                       return;
+
+               if (source & ZPROP_SRC_NONE)
+                       source = ZPROP_SRC_NONE;
+               else if (source & ZPROP_SRC_RECEIVED)
+                       source = ZPROP_SRC_LOCAL;
+       }
+
+       inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX);
+       recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
+
+       switch (source) {
+       case ZPROP_SRC_NONE:
+               /*
+                * revert to received value, if any (inherit -S)
+                * - remove propname
+                * - remove propname$inherit
+                */
+               err = zap_remove(mos, zapobj, propname, tx);
+               ASSERT(err == 0 || err == ENOENT);
+               err = zap_remove(mos, zapobj, inheritstr, tx);
                ASSERT(err == 0 || err == ENOENT);
-               if (isint) {
-                       VERIFY(0 == dsl_prop_get_ds(ds,
-                           psa->name, 8, 1, &intval, NULL));
+               break;
+       case ZPROP_SRC_LOCAL:
+               /*
+                * remove propname$inherit
+                * set propname -> value
+                */
+               err = zap_remove(mos, zapobj, inheritstr, tx);
+               ASSERT(err == 0 || err == ENOENT);
+               VERIFY(0 == zap_update(mos, zapobj, propname,
+                   psa->psa_intsz, psa->psa_numints, psa->psa_value, tx));
+               break;
+       case ZPROP_SRC_INHERITED:
+               /*
+                * explicitly inherit
+                * - remove propname
+                * - set propname$inherit
+                */
+               err = zap_remove(mos, zapobj, propname, tx);
+               ASSERT(err == 0 || err == ENOENT);
+               if (version >= SPA_VERSION_RECVD_PROPS &&
+                   dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy,
+                   NULL) == 0) {
+                       dummy = 0;
+                       err = zap_update(mos, zapobj, inheritstr,
+                           8, 1, &dummy, tx);
+                       ASSERT(err == 0);
                }
-       } else {
-               VERIFY(0 == zap_update(mos, zapobj, psa->name,
-                   psa->intsz, psa->numints, psa->buf, tx));
-               if (isint)
-                       intval = *(uint64_t *)psa->buf;
+               break;
+       case ZPROP_SRC_RECEIVED:
+               /*
+                * set propname$recvd -> value
+                */
+               err = zap_update(mos, zapobj, recvdstr,
+                   psa->psa_intsz, psa->psa_numints, psa->psa_value, tx);
+               ASSERT(err == 0);
+               break;
+       case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED):
+               /*
+                * clear local and received settings
+                * - remove propname
+                * - remove propname$inherit
+                * - remove propname$recvd
+                */
+               err = zap_remove(mos, zapobj, propname, tx);
+               ASSERT(err == 0 || err == ENOENT);
+               err = zap_remove(mos, zapobj, inheritstr, tx);
+               ASSERT(err == 0 || err == ENOENT);
+               /* FALLTHRU */
+       case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED):
+               /*
+                * remove propname$recvd
+                */
+               err = zap_remove(mos, zapobj, recvdstr, tx);
+               ASSERT(err == 0 || err == ENOENT);
+               break;
+       default:
+               cmn_err(CE_PANIC, "unexpected property source: %d", source);
        }
 
+       strfree(inheritstr);
+       strfree(recvdstr);
+
        if (isint) {
-               if (dsl_dataset_is_snapshot(ds)) {
+               VERIFY(0 == dsl_prop_get_ds(ds, propname, 8, 1, &intval, NULL));
+
+               if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) {
                        dsl_prop_cb_record_t *cbr;
                        /*
                         * It's a snapshot; nothing can inherit this
@@ -392,58 +679,85 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
                        for (cbr = list_head(&ds->ds_dir->dd_prop_cbs); cbr;
                            cbr = list_next(&ds->ds_dir->dd_prop_cbs, cbr)) {
                                if (cbr->cbr_ds == ds &&
-                                   strcmp(cbr->cbr_propname, psa->name) == 0)
+                                   strcmp(cbr->cbr_propname, propname) == 0)
                                        cbr->cbr_func(cbr->cbr_arg, intval);
                        }
                        mutex_exit(&ds->ds_dir->dd_lock);
                } else {
                        dsl_prop_changed_notify(ds->ds_dir->dd_pool,
-                           ds->ds_dir->dd_object, psa->name, intval, TRUE);
+                           ds->ds_dir->dd_object, propname, intval, TRUE);
                }
-       }
-       if (isint) {
+
                (void) snprintf(valbuf, sizeof (valbuf),
                    "%lld", (longlong_t)intval);
                valstr = valbuf;
        } else {
-               valstr = (char *)psa->buf;
+               if (source == ZPROP_SRC_LOCAL) {
+                       valstr = (char *)psa->psa_value;
+               } else {
+                       tbuf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+                       if (dsl_prop_get_ds(ds, propname, 1,
+                           ZAP_MAXVALUELEN, tbuf, NULL) == 0)
+                               valstr = tbuf;
+               }
        }
-       spa_history_internal_log((psa->numints == 0) ? LOG_DS_INHERIT :
-           LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx, cr,
-           "%s=%s dataset = %llu", psa->name, valstr, ds->ds_object);
+
+       spa_history_log_internal((source == ZPROP_SRC_NONE ||
+           source == ZPROP_SRC_INHERITED) ? LOG_DS_INHERIT :
+           LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx,
+           "%s=%s dataset = %llu", propname,
+           (valstr == NULL ? "" : valstr), ds->ds_object);
+
+       if (tbuf != NULL)
+               kmem_free(tbuf, ZAP_MAXVALUELEN);
 }
 
 void
-dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_props_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dataset_t *ds = arg1;
-       nvlist_t *nvl = arg2;
+       dsl_props_arg_t *pa = arg2;
+       nvlist_t *props = pa->pa_props;
+       dsl_prop_setarg_t psa;
        nvpair_t *elem = NULL;
 
-       while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
-               struct prop_set_arg psa;
+       psa.psa_source = pa->pa_source;
 
-               psa.name = nvpair_name(elem);
+       while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
+               nvpair_t *pair = elem;
 
-               if (nvpair_type(elem) == DATA_TYPE_STRING) {
-                       VERIFY(nvpair_value_string(elem,
-                           (char **)&psa.buf) == 0);
-                       psa.intsz = 1;
-                       psa.numints = strlen(psa.buf) + 1;
+               psa.psa_name = nvpair_name(pair);
+
+               if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+                       /*
+                        * dsl_prop_get_all_impl() returns properties in this
+                        * format.
+                        */
+                       nvlist_t *attrs;
+                       VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+                       VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+                           &pair) == 0);
+               }
+
+               if (nvpair_type(pair) == DATA_TYPE_STRING) {
+                       VERIFY(nvpair_value_string(pair,
+                           (char **)&psa.psa_value) == 0);
+                       psa.psa_intsz = 1;
+                       psa.psa_numints = strlen(psa.psa_value) + 1;
                } else {
                        uint64_t intval;
-                       VERIFY(nvpair_value_uint64(elem, &intval) == 0);
-                       psa.intsz = sizeof (intval);
-                       psa.numints = 1;
-                       psa.buf = &intval;
+                       VERIFY(nvpair_value_uint64(pair, &intval) == 0);
+                       psa.psa_intsz = sizeof (intval);
+                       psa.psa_numints = 1;
+                       psa.psa_value = &intval;
                }
-               dsl_prop_set_sync(ds, &psa, cr, tx);
+               dsl_prop_set_sync(ds, &psa, tx);
        }
 }
 
 void
 dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
-    cred_t *cr, dmu_tx_t *tx)
+    dmu_tx_t *tx)
 {
        objset_t *mos = dd->dd_pool->dp_meta_objset;
        uint64_t zapobj = dd->dd_phys->dd_props_zapobj;
@@ -454,19 +768,19 @@ dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
 
        dsl_prop_changed_notify(dd->dd_pool, dd->dd_object, name, val, TRUE);
 
-       spa_history_internal_log(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, cr,
+       spa_history_log_internal(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx,
            "%s=%llu dataset = %llu", name, (u_longlong_t)val,
            dd->dd_phys->dd_head_dataset_obj);
 }
 
 int
-dsl_prop_set(const char *dsname, const char *propname,
+dsl_prop_set(const char *dsname, const char *propname, zprop_source_t source,
     int intsz, int numints, const void *buf)
 {
        dsl_dataset_t *ds;
        uint64_t version;
        int err;
-       struct prop_set_arg psa;
+       dsl_prop_setarg_t psa;
 
        /*
         * We must do these checks before we get to the syncfunc, since
@@ -491,10 +805,13 @@ dsl_prop_set(const char *dsname, const char *propname,
                return (ENOTSUP);
        }
 
-       psa.name = propname;
-       psa.intsz = intsz;
-       psa.numints = numints;
-       psa.buf = buf;
+       psa.psa_name = propname;
+       psa.psa_source = source;
+       psa.psa_intsz = intsz;
+       psa.psa_numints = numints;
+       psa.psa_value = buf;
+       psa.psa_effective_value = -1ULL;
+
        err = dsl_sync_task_do(ds->ds_dir->dd_pool,
            NULL, dsl_prop_set_sync, ds, &psa, 2);
 
@@ -503,11 +820,12 @@ dsl_prop_set(const char *dsname, const char *propname,
 }
 
 int
-dsl_props_set(const char *dsname, nvlist_t *nvl)
+dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props)
 {
        dsl_dataset_t *ds;
        uint64_t version;
        nvpair_t *elem = NULL;
+       dsl_props_arg_t pa;
        int err;
 
        if (err = dsl_dataset_hold(dsname, FTAG, &ds))
@@ -516,7 +834,7 @@ dsl_props_set(const char *dsname, nvlist_t *nvl)
         * Do these checks before the syncfunc, since it can't fail.
         */
        version = spa_version(ds->ds_dir->dd_pool->dp_spa);
-       while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
+       while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
                if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) {
                        dsl_dataset_rele(ds, FTAG);
                        return (ENAMETOOLONG);
@@ -539,129 +857,281 @@ dsl_props_set(const char *dsname, nvlist_t *nvl)
                return (ENOTSUP);
        }
 
+       pa.pa_props = props;
+       pa.pa_source = source;
+
        err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-           NULL, dsl_props_set_sync, ds, nvl, 2);
+           NULL, dsl_props_set_sync, ds, &pa, 2);
 
        dsl_dataset_rele(ds, FTAG);
        return (err);
 }
 
+typedef enum dsl_prop_getflags {
+       DSL_PROP_GET_INHERITING = 0x1,  /* searching parent of target ds */
+       DSL_PROP_GET_SNAPSHOT = 0x2,    /* snapshot dataset */
+       DSL_PROP_GET_LOCAL = 0x4,       /* local properties */
+       DSL_PROP_GET_RECEIVED = 0x8     /* received properties */
+} dsl_prop_getflags_t;
+
+static int
+dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj,
+    const char *setpoint, dsl_prop_getflags_t flags, nvlist_t *nv)
+{
+       zap_cursor_t zc;
+       zap_attribute_t za;
+       int err = 0;
+
+       for (zap_cursor_init(&zc, mos, propobj);
+           (err = zap_cursor_retrieve(&zc, &za)) == 0;
+           zap_cursor_advance(&zc)) {
+               nvlist_t *propval;
+               zfs_prop_t prop;
+               char buf[ZAP_MAXNAMELEN];
+               char *valstr;
+               const char *suffix;
+               const char *propname;
+               const char *source;
+
+               suffix = strchr(za.za_name, '$');
+
+               if (suffix == NULL) {
+                       /*
+                        * Skip local properties if we only want received
+                        * properties.
+                        */
+                       if (flags & DSL_PROP_GET_RECEIVED)
+                               continue;
+
+                       propname = za.za_name;
+                       source = setpoint;
+               } else if (strcmp(suffix, ZPROP_INHERIT_SUFFIX) == 0) {
+                       /* Skip explicitly inherited entries. */
+                       continue;
+               } else if (strcmp(suffix, ZPROP_RECVD_SUFFIX) == 0) {
+                       if (flags & DSL_PROP_GET_LOCAL)
+                               continue;
+
+                       (void) strncpy(buf, za.za_name, (suffix - za.za_name));
+                       buf[suffix - za.za_name] = '\0';
+                       propname = buf;
+
+                       if (!(flags & DSL_PROP_GET_RECEIVED)) {
+                               /* Skip if locally overridden. */
+                               err = zap_contains(mos, propobj, propname);
+                               if (err == 0)
+                                       continue;
+                               if (err != ENOENT)
+                                       break;
+
+                               /* Skip if explicitly inherited. */
+                               valstr = kmem_asprintf("%s%s", propname,
+                                   ZPROP_INHERIT_SUFFIX);
+                               err = zap_contains(mos, propobj, valstr);
+                               strfree(valstr);
+                               if (err == 0)
+                                       continue;
+                               if (err != ENOENT)
+                                       break;
+                       }
+
+                       source = ((flags & DSL_PROP_GET_INHERITING) ?
+                           setpoint : ZPROP_SOURCE_VAL_RECVD);
+               } else {
+                       /*
+                        * For backward compatibility, skip suffixes we don't
+                        * recognize.
+                        */
+                       continue;
+               }
+
+               prop = zfs_name_to_prop(propname);
+
+               /* Skip non-inheritable properties. */
+               if ((flags & DSL_PROP_GET_INHERITING) && prop != ZPROP_INVAL &&
+                   !zfs_prop_inheritable(prop))
+                       continue;
+
+               /* Skip properties not valid for this type. */
+               if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_INVAL &&
+                   !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT))
+                       continue;
+
+               /* Skip properties already defined. */
+               if (nvlist_exists(nv, propname))
+                       continue;
+
+               VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+               if (za.za_integer_length == 1) {
+                       /*
+                        * String property
+                        */
+                       char *tmp = kmem_alloc(za.za_num_integers,
+                           KM_SLEEP);
+                       err = zap_lookup(mos, propobj,
+                           za.za_name, 1, za.za_num_integers, tmp);
+                       if (err != 0) {
+                               kmem_free(tmp, za.za_num_integers);
+                               break;
+                       }
+                       VERIFY(nvlist_add_string(propval, ZPROP_VALUE,
+                           tmp) == 0);
+                       kmem_free(tmp, za.za_num_integers);
+               } else {
+                       /*
+                        * Integer property
+                        */
+                       ASSERT(za.za_integer_length == 8);
+                       (void) nvlist_add_uint64(propval, ZPROP_VALUE,
+                           za.za_first_integer);
+               }
+
+               VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, source) == 0);
+               VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0);
+               nvlist_free(propval);
+       }
+       zap_cursor_fini(&zc);
+       if (err == ENOENT)
+               err = 0;
+       return (err);
+}
+
 /*
  * Iterate over all properties for this dataset and return them in an nvlist.
  */
-int
-dsl_prop_get_all(objset_t *os, nvlist_t **nvp, boolean_t local)
+static int
+dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp,
+    dsl_prop_getflags_t flags)
 {
-       dsl_dataset_t *ds = os->os->os_dsl_dataset;
        dsl_dir_t *dd = ds->ds_dir;
-       boolean_t snapshot = dsl_dataset_is_snapshot(ds);
-       int err = 0;
        dsl_pool_t *dp = dd->dd_pool;
        objset_t *mos = dp->dp_meta_objset;
-       uint64_t propobj = ds->ds_phys->ds_props_obj;
+       int err = 0;
+       char setpoint[MAXNAMELEN];
 
        VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
-       if (local && snapshot && !propobj)
-               return (0);
+       if (dsl_dataset_is_snapshot(ds))
+               flags |= DSL_PROP_GET_SNAPSHOT;
 
        rw_enter(&dp->dp_config_rwlock, RW_READER);
-       while (dd != NULL) {
-               char setpoint[MAXNAMELEN];
-               zap_cursor_t zc;
-               zap_attribute_t za;
-               dsl_dir_t *dd_next;
-
-               if (propobj) {
-                       dsl_dataset_name(ds, setpoint);
-                       dd_next = dd;
-               } else {
-                       dsl_dir_name(dd, setpoint);
-                       propobj = dd->dd_phys->dd_props_zapobj;
-                       dd_next = dd->dd_parent;
+
+       if (ds->ds_phys->ds_props_obj != 0) {
+               ASSERT(flags & DSL_PROP_GET_SNAPSHOT);
+               dsl_dataset_name(ds, setpoint);
+               err = dsl_prop_get_all_impl(mos, ds->ds_phys->ds_props_obj,
+                   setpoint, flags, *nvp);
+               if (err)
+                       goto out;
+       }
+
+       for (; dd != NULL; dd = dd->dd_parent) {
+               if (dd != ds->ds_dir || (flags & DSL_PROP_GET_SNAPSHOT)) {
+                       if (flags & (DSL_PROP_GET_LOCAL |
+                           DSL_PROP_GET_RECEIVED))
+                               break;
+                       flags |= DSL_PROP_GET_INHERITING;
                }
+               dsl_dir_name(dd, setpoint);
+               err = dsl_prop_get_all_impl(mos, dd->dd_phys->dd_props_zapobj,
+                   setpoint, flags, *nvp);
+               if (err)
+                       break;
+       }
+out:
+       rw_exit(&dp->dp_config_rwlock);
+       return (err);
+}
 
-               for (zap_cursor_init(&zc, mos, propobj);
-                   (err = zap_cursor_retrieve(&zc, &za)) == 0;
-                   zap_cursor_advance(&zc)) {
-                       nvlist_t *propval;
-                       zfs_prop_t prop = zfs_name_to_prop(za.za_name);
+boolean_t
+dsl_prop_get_hasrecvd(objset_t *os)
+{
+       dsl_dataset_t *ds = os->os_dsl_dataset;
+       int rc;
+       uint64_t dummy;
 
-                       /* Skip non-inheritable properties. */
-                       if (prop != ZPROP_INVAL &&
-                           !zfs_prop_inheritable(prop) &&
-                           (dd != ds->ds_dir || (snapshot && dd != dd_next)))
-                               continue;
+       rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
+       rc = dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy, NULL);
+       rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
+       ASSERT(rc != 0 || spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS);
+       return (rc == 0);
+}
 
-                       /* Skip properties not valid for this type. */
-                       if (snapshot && prop != ZPROP_INVAL &&
-                           !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT))
-                               continue;
+static void
+dsl_prop_set_hasrecvd_impl(objset_t *os, zprop_source_t source)
+{
+       dsl_dataset_t *ds = os->os_dsl_dataset;
+       uint64_t dummy = 0;
+       dsl_prop_setarg_t psa;
 
-                       /* Skip properties already defined */
-                       if (nvlist_lookup_nvlist(*nvp, za.za_name,
-                           &propval) == 0)
-                               continue;
+       if (spa_version(os->os_spa) < SPA_VERSION_RECVD_PROPS)
+               return;
 
-                       VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME,
-                           KM_SLEEP) == 0);
-                       if (za.za_integer_length == 1) {
-                               /*
-                                * String property
-                                */
-                               char *tmp = kmem_alloc(za.za_num_integers,
-                                   KM_SLEEP);
-                               err = zap_lookup(mos, propobj,
-                                   za.za_name, 1, za.za_num_integers, tmp);
-                               if (err != 0) {
-                                       kmem_free(tmp, za.za_num_integers);
-                                       break;
-                               }
-                               VERIFY(nvlist_add_string(propval, ZPROP_VALUE,
-                                   tmp) == 0);
-                               kmem_free(tmp, za.za_num_integers);
-                       } else {
-                               /*
-                                * Integer property
-                                */
-                               ASSERT(za.za_integer_length == 8);
-                               (void) nvlist_add_uint64(propval, ZPROP_VALUE,
-                                   za.za_first_integer);
-                       }
+       dsl_prop_setarg_init_uint64(&psa, ZPROP_HAS_RECVD, source, &dummy);
 
-                       VERIFY(nvlist_add_string(propval, ZPROP_SOURCE,
-                           setpoint) == 0);
-                       VERIFY(nvlist_add_nvlist(*nvp, za.za_name,
-                           propval) == 0);
-                       nvlist_free(propval);
-               }
-               zap_cursor_fini(&zc);
+       (void) dsl_sync_task_do(ds->ds_dir->dd_pool, NULL,
+           dsl_prop_set_sync, ds, &psa, 2);
+}
 
-               if (err != ENOENT)
-                       break;
-               err = 0;
-               /*
-                * If we are just after the props that have been set
-                * locally, then we are done after the first iteration.
-                */
-               if (local)
-                       break;
-               dd = dd_next;
-               propobj = 0;
+/*
+ * Call after successfully receiving properties to ensure that only the first
+ * receive on or after SPA_VERSION_RECVD_PROPS blows away local properties.
+ */
+void
+dsl_prop_set_hasrecvd(objset_t *os)
+{
+       if (dsl_prop_get_hasrecvd(os)) {
+               ASSERT(spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS);
+               return;
        }
-       rw_exit(&dp->dp_config_rwlock);
+       dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_LOCAL);
+}
 
-       return (err);
+void
+dsl_prop_unset_hasrecvd(objset_t *os)
+{
+       dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_NONE);
+}
+
+int
+dsl_prop_get_all(objset_t *os, nvlist_t **nvp)
+{
+       return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, 0));
+}
+
+int
+dsl_prop_get_received(objset_t *os, nvlist_t **nvp)
+{
+       /*
+        * Received properties are not distinguishable from local properties
+        * until the dataset has received properties on or after
+        * SPA_VERSION_RECVD_PROPS.
+        */
+       dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(os) ?
+           DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL);
+       return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags));
 }
 
 void
 dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value)
 {
        nvlist_t *propval;
+       const char *propname = zfs_prop_to_name(prop);
+       uint64_t default_value;
+
+       if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) {
+               VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0);
+               return;
+       }
 
        VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
        VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0);
-       VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0);
+       /* Indicate the default source if we can. */
+       if (dodefault(propname, 8, 1, &default_value) == 0 &&
+           value == default_value) {
+               VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, "") == 0);
+       }
+       VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0);
        nvlist_free(propval);
 }
 
@@ -669,9 +1139,15 @@ void
 dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value)
 {
        nvlist_t *propval;
+       const char *propname = zfs_prop_to_name(prop);
+
+       if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) {
+               VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0);
+               return;
+       }
 
        VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
        VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0);
-       VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0);
+       VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0);
        nvlist_free(propval);
 }
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
new file mode 100644 (file)
index 0000000..23c37c7
--- /dev/null
@@ -0,0 +1,1739 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/dsl_scan.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dnode.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zil_impl.h>
+#include <sys/zio_checksum.h>
+#include <sys/ddt.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#ifdef _KERNEL
+#include <sys/zfs_vfsops.h>
+#endif
+
+typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
+
+static scan_cb_t dsl_scan_defrag_cb;
+static scan_cb_t dsl_scan_scrub_cb;
+static scan_cb_t dsl_scan_remove_cb;
+static dsl_syncfunc_t dsl_scan_cancel_sync;
+static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
+
+int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
+int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
+int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
+boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
+boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetching */
+enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
+int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
+
+#define        DSL_SCAN_IS_SCRUB_RESILVER(scn) \
+       ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
+       (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
+
+extern int zfs_txg_timeout;
+
+/* the order has to match pool_scan_type */
+static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
+       NULL,
+       dsl_scan_scrub_cb,      /* POOL_SCAN_SCRUB */
+       dsl_scan_scrub_cb,      /* POOL_SCAN_RESILVER */
+};
+
+int
+dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
+{
+       int err;
+       dsl_scan_t *scn;
+       spa_t *spa = dp->dp_spa;
+       uint64_t f;
+
+       scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
+       scn->scn_dp = dp;
+
+       err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+           "scrub_func", sizeof (uint64_t), 1, &f);
+       if (err == 0) {
+               /*
+                * There was an old-style scrub in progress.  Restart a
+                * new-style scrub from the beginning.
+                */
+               scn->scn_restart_txg = txg;
+               zfs_dbgmsg("old-style scrub was in progress; "
+                   "restarting new-style scrub in txg %llu",
+                   scn->scn_restart_txg);
+
+               /*
+                * Load the queue obj from the old location so that it
+                * can be freed by dsl_scan_done().
+                */
+               (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+                   "scrub_queue", sizeof (uint64_t), 1,
+                   &scn->scn_phys.scn_queue_obj);
+       } else {
+               err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+                   DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+                   &scn->scn_phys);
+               if (err == ENOENT)
+                       return (0);
+               else if (err)
+                       return (err);
+
+               if (scn->scn_phys.scn_state == DSS_SCANNING &&
+                   spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
+                       /*
+                        * A new-type scrub was in progress on an old
+                        * pool, and the pool was accessed by old
+                        * software.  Restart from the beginning, since
+                        * the old software may have changed the pool in
+                        * the meantime.
+                        */
+                       scn->scn_restart_txg = txg;
+                       zfs_dbgmsg("new-style scrub was modified "
+                           "by old software; restarting in txg %llu",
+                           scn->scn_restart_txg);
+               }
+       }
+
+       spa_scan_stat_init(spa);
+       return (0);
+}
+
+void
+dsl_scan_fini(dsl_pool_t *dp)
+{
+       if (dp->dp_scan) {
+               kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
+               dp->dp_scan = NULL;
+       }
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+       dsl_scan_t *scn = arg1;
+
+       if (scn->scn_phys.scn_state == DSS_SCANNING)
+               return (EBUSY);
+
+       return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+       dsl_scan_t *scn = arg1;
+       pool_scan_func_t *funcp = arg2;
+       dmu_object_type_t ot = 0;
+       dsl_pool_t *dp = scn->scn_dp;
+       spa_t *spa = dp->dp_spa;
+
+       ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
+       ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
+       bzero(&scn->scn_phys, sizeof (scn->scn_phys));
+       scn->scn_phys.scn_func = *funcp;
+       scn->scn_phys.scn_state = DSS_SCANNING;
+       scn->scn_phys.scn_min_txg = 0;
+       scn->scn_phys.scn_max_txg = tx->tx_txg;
+       scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
+       scn->scn_phys.scn_start_time = gethrestime_sec();
+       scn->scn_phys.scn_errors = 0;
+       scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
+       scn->scn_restart_txg = 0;
+       spa_scan_stat_init(spa);
+
+       if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+               scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
+
+               /* rewrite all disk labels */
+               vdev_config_dirty(spa->spa_root_vdev);
+
+               if (vdev_resilver_needed(spa->spa_root_vdev,
+                   &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
+                       spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
+               } else {
+                       spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START);
+               }
+
+               spa->spa_scrub_started = B_TRUE;
+               /*
+                * If this is an incremental scrub, limit the DDT scrub phase
+                * to just the auto-ditto class (for correctness); the rest
+                * of the scrub should go faster using top-down pruning.
+                */
+               if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
+                       scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
+
+       }
+
+       /* back to the generic stuff */
+
+       if (dp->dp_blkstats == NULL) {
+               dp->dp_blkstats =
+                   kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
+       }
+       bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+
+       if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
+               ot = DMU_OT_ZAP_OTHER;
+
+       scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
+           ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
+
+       dsl_scan_sync_state(scn, tx);
+
+       spa_history_log_internal(LOG_POOL_SCAN, spa, tx,
+           "func=%u mintxg=%llu maxtxg=%llu",
+           *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
+{
+       static const char *old_names[] = {
+               "scrub_bookmark",
+               "scrub_ddt_bookmark",
+               "scrub_ddt_class_max",
+               "scrub_queue",
+               "scrub_min_txg",
+               "scrub_max_txg",
+               "scrub_func",
+               "scrub_errors",
+               NULL
+       };
+
+       dsl_pool_t *dp = scn->scn_dp;
+       spa_t *spa = dp->dp_spa;
+       int i;
+
+       /* Remove any remnants of an old-style scrub. */
+       for (i = 0; old_names[i]; i++) {
+               (void) zap_remove(dp->dp_meta_objset,
+                   DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
+       }
+
+       if (scn->scn_phys.scn_queue_obj != 0) {
+               VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj, tx));
+               scn->scn_phys.scn_queue_obj = 0;
+       }
+
+       /*
+        * If we were "restarted" from a stopped state, don't bother
+        * with anything else.
+        */
+       if (scn->scn_phys.scn_state != DSS_SCANNING)
+               return;
+
+       if (complete)
+               scn->scn_phys.scn_state = DSS_FINISHED;
+       else
+               scn->scn_phys.scn_state = DSS_CANCELED;
+
+       spa_history_log_internal(LOG_POOL_SCAN_DONE, spa, tx,
+           "complete=%u", complete);
+
+       if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+               mutex_enter(&spa->spa_scrub_lock);
+               while (spa->spa_scrub_inflight > 0) {
+                       cv_wait(&spa->spa_scrub_io_cv,
+                           &spa->spa_scrub_lock);
+               }
+               mutex_exit(&spa->spa_scrub_lock);
+               spa->spa_scrub_started = B_FALSE;
+               spa->spa_scrub_active = B_FALSE;
+
+               /*
+                * If the scrub/resilver completed, update all DTLs to
+                * reflect this.  Whether it succeeded or not, vacate
+                * all temporary scrub DTLs.
+                */
+               vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+                   complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
+               if (complete) {
+                       spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ?
+                           ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
+               }
+               spa_errlog_rotate(spa);
+
+               /*
+                * We may have finished replacing a device.
+                * Let the async thread assess this and handle the detach.
+                */
+               spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+       }
+
+       scn->scn_phys.scn_end_time = gethrestime_sec();
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+       dsl_scan_t *scn = arg1;
+
+       if (scn->scn_phys.scn_state != DSS_SCANNING)
+               return (ENOENT);
+       return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+       dsl_scan_t *scn = arg1;
+
+       dsl_scan_done(scn, B_FALSE, tx);
+       dsl_scan_sync_state(scn, tx);
+}
+
+int
+dsl_scan_cancel(dsl_pool_t *dp)
+{
+       boolean_t complete = B_FALSE;
+       int err;
+
+       err = dsl_sync_task_do(dp, dsl_scan_cancel_check,
+           dsl_scan_cancel_sync, dp->dp_scan, &complete, 3);
+       return (err);
+}
+
+static void dsl_scan_visitbp(blkptr_t *bp,
+    const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf,
+    dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
+    dmu_tx_t *tx);
+static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds,
+    dmu_objset_type_t ostype,
+    dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx);
+
+void
+dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
+{
+       zio_free(dp->dp_spa, txg, bp);
+}
+
+void
+dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
+{
+       ASSERT(dsl_pool_sync_context(dp));
+       zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
+}
+
+int
+dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
+    arc_done_func_t *done, void *private, int priority, int zio_flags,
+    uint32_t *arc_flags, const zbookmark_t *zb)
+{
+       return (arc_read(pio, spa, bpp, pbuf, done, private,
+           priority, zio_flags, arc_flags, zb));
+}
+
+int
+dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
+    arc_done_func_t *done, void *private, int priority, int zio_flags,
+    uint32_t *arc_flags, const zbookmark_t *zb)
+{
+       return (arc_read_nolock(pio, spa, bpp, done, private,
+           priority, zio_flags, arc_flags, zb));
+}
+
+static boolean_t
+bookmark_is_zero(const zbookmark_t *zb)
+{
+       return (zb->zb_objset == 0 && zb->zb_object == 0 &&
+           zb->zb_level == 0 && zb->zb_blkid == 0);
+}
+
+/* dnp is the dnode for zb1->zb_object */
+static boolean_t
+bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
+    const zbookmark_t *zb2)
+{
+       uint64_t zb1nextL0, zb2thisobj;
+
+       ASSERT(zb1->zb_objset == zb2->zb_objset);
+       ASSERT(zb2->zb_level == 0);
+
+       /*
+        * A bookmark in the deadlist is considered to be after
+        * everything else.
+        */
+       if (zb2->zb_object == DMU_DEADLIST_OBJECT)
+               return (B_TRUE);
+
+       /* The objset_phys_t isn't before anything. */
+       if (dnp == NULL)
+               return (B_FALSE);
+
+       zb1nextL0 = (zb1->zb_blkid + 1) <<
+           ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+
+       zb2thisobj = zb2->zb_object ? zb2->zb_object :
+           zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
+
+       if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
+               uint64_t nextobj = zb1nextL0 *
+                   (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
+               return (nextobj <= zb2thisobj);
+       }
+
+       if (zb1->zb_object < zb2thisobj)
+               return (B_TRUE);
+       if (zb1->zb_object > zb2thisobj)
+               return (B_FALSE);
+       if (zb2->zb_object == DMU_META_DNODE_OBJECT)
+               return (B_FALSE);
+       return (zb1nextL0 <= zb2->zb_blkid);
+}
+
+static uint64_t
+dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
+{
+       uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
+       if (dsl_dataset_is_snapshot(ds))
+               return (MIN(smt, ds->ds_phys->ds_creation_txg));
+       return (smt);
+}
+
+static void
+dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+       VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset,
+           DMU_POOL_DIRECTORY_OBJECT,
+           DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+           &scn->scn_phys, tx));
+}
+
+/*
+ * Decide whether the scan should pause until the next txg sync.
+ * zb is the bookmark of the block about to be visited, or NULL when
+ * called from the DDT walk.  Once a pause is decided the resume point
+ * is recorded in scn_phys and B_TRUE is returned for every subsequent
+ * call during this sync.
+ */
+static boolean_t
+dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
+{
+       uint64_t elapsed_nanosecs;
+       int mintime;
+
+       /* we never skip user/group accounting objects */
+       if (zb && (int64_t)zb->zb_object < 0)
+               return (B_FALSE);
+
+       if (scn->scn_pausing)
+               return (B_TRUE); /* we're already pausing */
+
+       if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark))
+               return (B_FALSE); /* we're resuming */
+
+       /* We only know how to resume from level-0 blocks. */
+       if (zb && zb->zb_level != 0)
+               return (B_FALSE);
+
+       /* Resilvers get a different minimum time budget than scrubs. */
+       mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+           zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
+       elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+       /*
+        * Pause if we have exceeded the txg timeout (ns/NANOSEC =
+        * seconds), or have used our minimum time (ns/MICROSEC =
+        * milliseconds) while a txg sync is waiting on us, or the
+        * spa is shutting down.
+        */
+       if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+           (elapsed_nanosecs / MICROSEC > mintime &&
+           txg_sync_waiting(scn->scn_dp)) ||
+           spa_shutting_down(scn->scn_dp->dp_spa)) {
+               if (zb) {
+                       dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
+                           (longlong_t)zb->zb_objset,
+                           (longlong_t)zb->zb_object,
+                           (longlong_t)zb->zb_level,
+                           (longlong_t)zb->zb_blkid);
+                       scn->scn_phys.scn_bookmark = *zb;
+               }
+               dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
+                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
+                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
+                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
+                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor,
+                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
+               scn->scn_pausing = B_TRUE;
+               return (B_TRUE);
+       }
+       return (B_FALSE);
+}
+
+/* Argument bundle passed through zil_parse() to the scan callbacks. */
+typedef struct zil_scan_arg {
+       dsl_pool_t      *zsa_dp;        /* pool being scanned */
+       zil_header_t    *zsa_zh;        /* ZIL header of the objset */
+} zil_scan_arg_t;
+
+/*
+ * zil_parse() block callback: hand one ZIL log block to the active
+ * scan function, unless it predates the scan's minimum txg or was
+ * never claimed.
+ */
+/* ARGSUSED */
+static int
+dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+       zil_scan_arg_t *zsa = arg;
+       dsl_pool_t *dp = zsa->zsa_dp;
+       dsl_scan_t *scn = dp->dp_scan;
+       zil_header_t *zh = zsa->zsa_zh;
+       zbookmark_t zb;
+
+       if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+               return (0);
+
+       /*
+        * One block ("stubby") can be allocated a long time ago; we
+        * want to visit that one because it has been allocated
+        * (on-disk) even if it hasn't been claimed (even though for
+        * scrub there's nothing to do to it).
+        */
+       if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
+               return (0);
+
+       /* Bookmark the block by its log-chain sequence number. */
+       SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+           ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+       VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+       return (0);
+}
+
+/*
+ * zil_parse() record callback: for TX_WRITE records, hand the block
+ * pointer embedded in the record to the active scan function.  All
+ * other record types carry no block pointers and are ignored.
+ */
+/* ARGSUSED */
+static int
+dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
+{
+       if (lrc->lrc_txtype == TX_WRITE) {
+               zil_scan_arg_t *zsa = arg;
+               dsl_pool_t *dp = zsa->zsa_dp;
+               dsl_scan_t *scn = dp->dp_scan;
+               zil_header_t *zh = zsa->zsa_zh;
+               lr_write_t *lr = (lr_write_t *)lrc;
+               blkptr_t *bp = &lr->lr_blkptr;
+               zbookmark_t zb;
+
+               if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+                       return (0);
+
+               /*
+                * birth can be < claim_txg if this record's txg is
+                * already txg sync'ed (but this log block contains
+                * other records that are not synced)
+                */
+               if (claim_txg == 0 || bp->blk_birth < claim_txg)
+                       return (0);
+
+               /* Bookmark by object and level-0 blkid of the write. */
+               SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+                   lr->lr_foid, ZB_ZIL_LEVEL,
+                   lr->lr_offset / BP_GET_LSIZE(bp));
+
+               VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+       }
+       return (0);
+}
+
+/*
+ * Scan an objset's intent log by walking it with zil_parse() and the
+ * two callbacks above.
+ */
+static void
+dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
+{
+       uint64_t claim_txg = zh->zh_claim_txg;
+       zil_scan_arg_t zsa = { dp, zh };
+       zilog_t *zilog;
+
+       /*
+        * We only want to visit blocks that have been claimed but not yet
+        * replayed (or, in read-only mode, blocks that *would* be claimed).
+        */
+       if (claim_txg == 0 && spa_writeable(dp->dp_spa))
+               return;
+
+       zilog = zil_alloc(dp->dp_meta_objset, zh);
+
+       (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
+           claim_txg);
+
+       zil_free(zilog);
+}
+
+/*
+ * Issue a no-wait ARC prefetch for bp so the later synchronous read in
+ * dsl_scan_recurse() hits cache.  Skipped when prefetch is disabled,
+ * for holes, for blocks born at or before scn_min_txg, and for level-0
+ * blocks other than dnode blocks (plain data is not recursed into).
+ */
+/* ARGSUSED */
+static void
+dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
+    uint64_t objset, uint64_t object, uint64_t blkid)
+{
+       zbookmark_t czb;
+       uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
+
+       if (zfs_no_scrub_prefetch)
+               return;
+
+       if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
+           (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
+               return;
+
+       SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
+
+       /*
+        * XXX need to make sure all of these arc_read() prefetches are
+        * done before setting xlateall (similar to dsl_read())
+        */
+       (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
+           buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
+           &flags, &czb);
+}
+
+/*
+ * When resuming a paused scan, decide whether the block at zb was
+ * already visited in a prior txg sync.  Returns B_TRUE if the caller
+ * should skip it.  Once the resume point is reached (or passed), the
+ * saved bookmark is zeroed so pause checks re-enable.
+ */
+static boolean_t
+dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
+    const zbookmark_t *zb)
+{
+       /*
+        * We never skip over user/group accounting objects (obj<0)
+        */
+       if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) &&
+           (int64_t)zb->zb_object >= 0) {
+               /*
+                * If we already visited this bp & everything below (in
+                * a prior txg sync), don't bother doing it again.
+                */
+               if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
+                       return (B_TRUE);
+
+               /*
+                * If we found the block we're trying to resume from, or
+                * we went past it to a different object, zero it out to
+                * indicate that it's OK to start checking for pausing
+                * again.
+                */
+               if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
+                   zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
+                       dprintf("resuming at %llx/%llx/%llx/%llx\n",
+                           (longlong_t)zb->zb_objset,
+                           (longlong_t)zb->zb_object,
+                           (longlong_t)zb->zb_level,
+                           (longlong_t)zb->zb_blkid);
+                       bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
+               }
+       }
+       return (B_FALSE);
+}
+
+/*
+ * Read the block named by bp and recurse into whatever it references:
+ * indirect blocks recurse into each child blkptr (with a prefetch pass
+ * first), dnode blocks recurse into each contained dnode, and objset
+ * blocks recurse into the meta-dnode (plus the user/group accounting
+ * dnodes and, for scrub/resilver, the objset's ZIL).  USERGROUP_USED
+ * blocks are read but not recursed into.
+ *
+ * Return nonzero on i/o error (scn_errors is incremented).
+ * Return new buf to write out in *bufp.
+ */
+static int
+dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
+    dnode_phys_t *dnp, const blkptr_t *bp,
+    const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp)
+{
+       dsl_pool_t *dp = scn->scn_dp;
+       int err;
+
+       if (BP_GET_LEVEL(bp) > 0) {
+               uint32_t flags = ARC_WAIT;
+               int i;
+               blkptr_t *cbp;
+               int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+
+               err = arc_read_nolock(NULL, dp->dp_spa, bp,
+                   arc_getbuf_func, bufp,
+                   ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+               if (err) {
+                       scn->scn_phys.scn_errors++;
+                       return (err);
+               }
+               /* Prefetch all children before visiting any of them. */
+               for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
+                       dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset,
+                           zb->zb_object, zb->zb_blkid * epb + i);
+               }
+               for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
+                       zbookmark_t czb;
+
+                       SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+                           zb->zb_level - 1,
+                           zb->zb_blkid * epb + i);
+                       dsl_scan_visitbp(cbp, &czb, dnp,
+                           *bufp, ds, scn, ostype, tx);
+               }
+       } else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) {
+               uint32_t flags = ARC_WAIT;
+
+               /* Read only; accounting blocks have no children. */
+               err = arc_read_nolock(NULL, dp->dp_spa, bp,
+                   arc_getbuf_func, bufp,
+                   ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+               if (err) {
+                       scn->scn_phys.scn_errors++;
+                       return (err);
+               }
+       } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+               uint32_t flags = ARC_WAIT;
+               dnode_phys_t *cdnp;
+               int i, j;
+               int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+
+               err = arc_read_nolock(NULL, dp->dp_spa, bp,
+                   arc_getbuf_func, bufp,
+                   ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+               if (err) {
+                       scn->scn_phys.scn_errors++;
+                       return (err);
+               }
+               /* Prefetch every blkptr of every dnode in the block. */
+               for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
+                       for (j = 0; j < cdnp->dn_nblkptr; j++) {
+                               blkptr_t *cbp = &cdnp->dn_blkptr[j];
+                               dsl_scan_prefetch(scn, *bufp, cbp,
+                                   zb->zb_objset, zb->zb_blkid * epb + i, j);
+                       }
+               }
+               for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
+                       dsl_scan_visitdnode(scn, ds, ostype,
+                           cdnp, *bufp, zb->zb_blkid * epb + i, tx);
+               }
+
+       } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+               uint32_t flags = ARC_WAIT;
+               objset_phys_t *osp;
+
+               err = arc_read_nolock(NULL, dp->dp_spa, bp,
+                   arc_getbuf_func, bufp,
+                   ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+               if (err) {
+                       scn->scn_phys.scn_errors++;
+                       return (err);
+               }
+
+               osp = (*bufp)->b_data;
+
+               /* Scrub/resilver also walks this objset's intent log. */
+               if (DSL_SCAN_IS_SCRUB_RESILVER(scn))
+                       dsl_scan_zil(dp, &osp->os_zil_header);
+
+               dsl_scan_visitdnode(scn, ds, osp->os_type,
+                   &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx);
+
+               if (OBJSET_BUF_HAS_USERUSED(*bufp)) {
+                       /*
+                        * We also always visit user/group accounting
+                        * objects, and never skip them, even if we are
+                        * pausing.  This is necessary so that the space
+                        * deltas from this txg get integrated.
+                        */
+                       dsl_scan_visitdnode(scn, ds, osp->os_type,
+                           &osp->os_groupused_dnode, *bufp,
+                           DMU_GROUPUSED_OBJECT, tx);
+                       dsl_scan_visitdnode(scn, ds, osp->os_type,
+                           &osp->os_userused_dnode, *bufp,
+                           DMU_USERUSED_OBJECT, tx);
+               }
+       }
+
+       return (0);
+}
+
+/*
+ * Visit every block pointer of one dnode, plus its spill block when
+ * the dnode has one (DNODE_FLAG_SPILL_BLKPTR).
+ */
+static void
+dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
+    dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf,
+    uint64_t object, dmu_tx_t *tx)
+{
+       int j;
+
+       for (j = 0; j < dnp->dn_nblkptr; j++) {
+               zbookmark_t czb;
+
+               SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+                   dnp->dn_nlevels - 1, j);
+               dsl_scan_visitbp(&dnp->dn_blkptr[j],
+                   &czb, dnp, buf, ds, scn, ostype, tx);
+       }
+
+       if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+               zbookmark_t czb;
+               SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+                   0, DMU_SPILL_BLKID);
+               dsl_scan_visitbp(&dnp->dn_spill,
+                   &czb, dnp, buf, ds, scn, ostype, tx);
+       }
+}
+
+/*
+ * Core per-block visitor: apply pause/resume/birth-txg filtering,
+ * recurse into the block's children via dsl_scan_recurse(), then hand
+ * the block itself to the active scan function unless the DDT phase
+ * already covered it.
+ *
+ * The arguments are in this order because mdb can only print the
+ * first 5; we want them to be useful.
+ */
+static void
+dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
+    dnode_phys_t *dnp, arc_buf_t *pbuf,
+    dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
+    dmu_tx_t *tx)
+{
+       dsl_pool_t *dp = scn->scn_dp;
+       arc_buf_t *buf = NULL;
+       blkptr_t bp_toread = *bp;
+
+       /* ASSERT(pbuf == NULL || arc_released(pbuf)); */
+
+       if (dsl_scan_check_pause(scn, zb))
+               return;
+
+       if (dsl_scan_check_resume(scn, dnp, zb))
+               return;
+
+       if (bp->blk_birth == 0)
+               return;
+
+       scn->scn_visited_this_txg++;
+
+       dprintf_bp(bp,
+           "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p",
+           ds, ds ? ds->ds_object : 0,
+           zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
+           pbuf, bp);
+
+       if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+               return;
+
+       if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) {
+               /*
+                * For non-user-accounting blocks, we need to read the
+                * new bp (from a deleted snapshot, found in
+                * check_existing_xlation).  If we used the old bp,
+                * pointers inside this block from before we resumed
+                * would be untranslated.
+                *
+                * For user-accounting blocks, we need to read the old
+                * bp, because we will apply the entire space delta to
+                * it (original untranslated -> translations from
+                * deleted snap -> now).
+                */
+               /*
+                * NOTE(review): bp_toread already holds *bp from its
+                * initializer above, so this assignment is a no-op here;
+                * it appears kept for symmetry with the comment above.
+                */
+               bp_toread = *bp;
+       }
+
+       if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx,
+           &buf) != 0)
+               return;
+
+       /*
+        * If dsl_scan_ddt() has aready visited this block, it will have
+        * already done any translations or scrubbing, so don't call the
+        * callback again.
+        */
+       if (ddt_class_contains(dp->dp_spa,
+           scn->scn_phys.scn_ddt_class_max, bp)) {
+               ASSERT(buf == NULL);
+               return;
+       }
+
+       /*
+        * If this block is from the future (after cur_max_txg), then we
+        * are doing this on behalf of a deleted snapshot, and we will
+        * revisit the future block on the next pass of this dataset.
+        * Don't scan it now unless we need to because something
+        * under it was modified.
+        */
+       if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) {
+               scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
+       }
+       if (buf)
+               (void) arc_buf_remove_ref(buf, &buf);
+}
+
+/*
+ * Visit the root block pointer of a dataset (or of the MOS when
+ * ds == NULL), seeding the recursive traversal with a root bookmark.
+ */
+static void
+dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
+    dmu_tx_t *tx)
+{
+       zbookmark_t zb;
+
+       SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+           ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+       dsl_scan_visitbp(bp, &zb, NULL, NULL,
+           ds, scn, DMU_OST_NONE, tx);
+
+       dprintf_ds(ds, "finished scan%s", "");
+}
+
+/*
+ * Dataset-destroy hook: keep the in-progress scan's bookmark and work
+ * queue consistent when ds goes away.  If we are currently traversing
+ * ds, the bookmark is redirected to its next snapshot (or marked
+ * destroyed); if ds is queued, its queue entry is replaced or removed.
+ * The updated state is synced out immediately.
+ */
+void
+dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+       dsl_pool_t *dp = ds->ds_dir->dd_pool;
+       dsl_scan_t *scn = dp->dp_scan;
+       uint64_t mintxg;
+
+       if (scn->scn_phys.scn_state != DSS_SCANNING)
+               return;
+
+       if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
+               if (dsl_dataset_is_snapshot(ds)) {
+                       /* Note, scn_cur_{min,max}_txg stays the same. */
+                       scn->scn_phys.scn_bookmark.zb_objset =
+                           ds->ds_phys->ds_next_snap_obj;
+                       zfs_dbgmsg("destroying ds %llu; currently traversing; "
+                           "reset zb_objset to %llu",
+                           (u_longlong_t)ds->ds_object,
+                           (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+                       scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
+               } else {
+                       SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
+                           ZB_DESTROYED_OBJSET, 0, 0, 0);
+                       zfs_dbgmsg("destroying ds %llu; currently traversing; "
+                           "reset bookmark to -1,0,0,0",
+                           (u_longlong_t)ds->ds_object);
+               }
+       } else if (zap_lookup_int_key(dp->dp_meta_objset,
+           scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+               ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+               VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+               if (dsl_dataset_is_snapshot(ds)) {
+                       /*
+                        * We keep the same mintxg; it could be >
+                        * ds_creation_txg if the previous snapshot was
+                        * deleted too.
+                        */
+                       VERIFY(zap_add_int_key(dp->dp_meta_objset,
+                           scn->scn_phys.scn_queue_obj,
+                           ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0);
+                       zfs_dbgmsg("destroying ds %llu; in queue; "
+                           "replacing with %llu",
+                           (u_longlong_t)ds->ds_object,
+                           (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+               } else {
+                       zfs_dbgmsg("destroying ds %llu; in queue; removing",
+                           (u_longlong_t)ds->ds_object);
+               }
+       } else {
+               zfs_dbgmsg("destroying ds %llu; ignoring",
+                   (u_longlong_t)ds->ds_object);
+       }
+
+       /*
+        * dsl_scan_sync() should be called after this, and should sync
+        * out our changed state, but just to be safe, do it here.
+        */
+       dsl_scan_sync_state(scn, tx);
+}
+
+/*
+ * Dataset-snapshot hook: when ds is snapshotted mid-scan, the scan's
+ * reference to ds (bookmark or queue entry) is moved to the newly
+ * created snapshot (ds_prev_snap_obj), then the state is synced out.
+ */
+void
+dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+       dsl_pool_t *dp = ds->ds_dir->dd_pool;
+       dsl_scan_t *scn = dp->dp_scan;
+       uint64_t mintxg;
+
+       if (scn->scn_phys.scn_state != DSS_SCANNING)
+               return;
+
+       ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
+
+       if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
+               scn->scn_phys.scn_bookmark.zb_objset =
+                   ds->ds_phys->ds_prev_snap_obj;
+               zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
+                   "reset zb_objset to %llu",
+                   (u_longlong_t)ds->ds_object,
+                   (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+       } else if (zap_lookup_int_key(dp->dp_meta_objset,
+           scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+               VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+               VERIFY(zap_add_int_key(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj,
+                   ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0);
+               zfs_dbgmsg("snapshotting ds %llu; in queue; "
+                   "replacing with %llu",
+                   (u_longlong_t)ds->ds_object,
+                   (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+       }
+       dsl_scan_sync_state(scn, tx);
+}
+
+/*
+ * Clone-swap hook: ds1 and ds2 have exchanged contents, so swap any
+ * scan bookmark or queue entries that referenced them.  EEXIST from
+ * the first zap_add_int_key means both datasets were already queued,
+ * in which case the removed entry is re-added unchanged.
+ */
+void
+dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
+{
+       dsl_pool_t *dp = ds1->ds_dir->dd_pool;
+       dsl_scan_t *scn = dp->dp_scan;
+       uint64_t mintxg;
+
+       if (scn->scn_phys.scn_state != DSS_SCANNING)
+               return;
+
+       if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
+               scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
+               zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+                   "reset zb_objset to %llu",
+                   (u_longlong_t)ds1->ds_object,
+                   (u_longlong_t)ds2->ds_object);
+       } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
+               scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
+               zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+                   "reset zb_objset to %llu",
+                   (u_longlong_t)ds2->ds_object,
+                   (u_longlong_t)ds1->ds_object);
+       }
+
+       if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+           ds1->ds_object, &mintxg) == 0) {
+               int err;
+
+               ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
+               ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+               VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
+               err = zap_add_int_key(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
+               VERIFY(err == 0 || err == EEXIST);
+               if (err == EEXIST) {
+                       /* Both were there to begin with */
+                       VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
+                           scn->scn_phys.scn_queue_obj,
+                           ds1->ds_object, mintxg, tx));
+               }
+               zfs_dbgmsg("clone_swap ds %llu; in queue; "
+                   "replacing with %llu",
+                   (u_longlong_t)ds1->ds_object,
+                   (u_longlong_t)ds2->ds_object);
+       } else if (zap_lookup_int_key(dp->dp_meta_objset,
+           scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
+               ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
+               ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+               VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
+               VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
+               zfs_dbgmsg("clone_swap ds %llu; in queue; "
+                   "replacing with %llu",
+                   (u_longlong_t)ds2->ds_object,
+                   (u_longlong_t)ds1->ds_object);
+       }
+
+       dsl_scan_sync_state(scn, tx);
+}
+
+/* Argument bundle for enqueue_clones_cb(). */
+struct enqueue_clones_arg {
+       dmu_tx_t *tx;           /* open transaction for queue updates */
+       uint64_t originobj;     /* origin snapshot whose clones we want */
+};
+
+/*
+ * dmu_objset_find_spa() callback: for each dataset that is a clone of
+ * eca->originobj, walk back to its earliest snapshot after the origin
+ * and add that snapshot to the scan work queue.
+ */
+/* ARGSUSED */
+static int
+enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+       struct enqueue_clones_arg *eca = arg;
+       dsl_dataset_t *ds;
+       int err;
+       dsl_pool_t *dp = spa->spa_dsl_pool;
+       dsl_scan_t *scn = dp->dp_scan;
+
+       err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+       if (err)
+               return (err);
+
+       if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
+               /* Walk snapshots backwards until just past the origin. */
+               while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
+                       dsl_dataset_t *prev;
+                       err = dsl_dataset_hold_obj(dp,
+                           ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
+
+                       dsl_dataset_rele(ds, FTAG);
+                       if (err)
+                               return (err);
+                       ds = prev;
+               }
+               VERIFY(zap_add_int_key(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj, ds->ds_object,
+                   ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0);
+       }
+       dsl_dataset_rele(ds, FTAG);
+       return (0);
+}
+
+/*
+ * Scan one dataset: traverse its root bp, and if the pass completed
+ * (no pause), enqueue its descendents -- the next snapshot and, for
+ * snapshots with multiple children, all clones.  An incomplete pass
+ * (DSF_VISIT_DS_AGAIN) re-queues the dataset itself.
+ */
+static void
+dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
+{
+       dsl_pool_t *dp = scn->scn_dp;
+       dsl_dataset_t *ds;
+
+       VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+
+       /*
+        * Iterate over the bps in this ds.
+        */
+       dmu_buf_will_dirty(ds->ds_dbuf, tx);
+       dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx);
+
+       char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP);
+       dsl_dataset_name(ds, dsname);
+       zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
+           "pausing=%u",
+           (longlong_t)dsobj, dsname,
+           (longlong_t)scn->scn_phys.scn_cur_min_txg,
+           (longlong_t)scn->scn_phys.scn_cur_max_txg,
+           (int)scn->scn_pausing);
+       kmem_free(dsname, ZFS_MAXNAMELEN);
+
+       if (scn->scn_pausing)
+               goto out;
+
+       /*
+        * We've finished this pass over this dataset.
+        */
+
+       /*
+        * If we did not completely visit this dataset, do another pass.
+        */
+       if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
+               zfs_dbgmsg("incomplete pass; visiting again");
+               scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
+               VERIFY(zap_add_int_key(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj, ds->ds_object,
+                   scn->scn_phys.scn_cur_max_txg, tx) == 0);
+               goto out;
+       }
+
+       /*
+        * Add descendent datasets to work queue.
+        */
+       if (ds->ds_phys->ds_next_snap_obj != 0) {
+               VERIFY(zap_add_int_key(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj,
+                   ds->ds_phys->ds_creation_txg, tx) == 0);
+       }
+       if (ds->ds_phys->ds_num_children > 1) {
+               boolean_t usenext = B_FALSE;
+               if (ds->ds_phys->ds_next_clones_obj != 0) {
+                       uint64_t count;
+                       /*
+                        * A bug in a previous version of the code could
+                        * cause upgrade_clones_cb() to not set
+                        * ds_next_snap_obj when it should, leading to a
+                        * missing entry.  Therefore we can only use the
+                        * next_clones_obj when its count is correct.
+                        */
+                       int err = zap_count(dp->dp_meta_objset,
+                           ds->ds_phys->ds_next_clones_obj, &count);
+                       if (err == 0 &&
+                           count == ds->ds_phys->ds_num_children - 1)
+                               usenext = B_TRUE;
+               }
+
+               if (usenext) {
+                       /* Fast path: bulk-enqueue from next_clones_obj. */
+                       VERIFY(zap_join_key(dp->dp_meta_objset,
+                           ds->ds_phys->ds_next_clones_obj,
+                           scn->scn_phys.scn_queue_obj,
+                           ds->ds_phys->ds_creation_txg, tx) == 0);
+               } else {
+                       /* Slow path: find clones by scanning all datasets. */
+                       struct enqueue_clones_arg eca;
+                       eca.tx = tx;
+                       eca.originobj = ds->ds_object;
+
+                       (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
+                           NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
+               }
+       }
+
+out:
+       dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * dmu_objset_find_spa() callback used on pools predating
+ * SPA_VERSION_DSL_SCRUB: for each non-clone filesystem, walk back to
+ * its oldest snapshot and add that to the scan work queue.  Clones are
+ * skipped here (they are reached via their origin).
+ */
+/* ARGSUSED */
+static int
+enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+       dmu_tx_t *tx = arg;
+       dsl_dataset_t *ds;
+       int err;
+       dsl_pool_t *dp = spa->spa_dsl_pool;
+       dsl_scan_t *scn = dp->dp_scan;
+
+       err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+       if (err)
+               return (err);
+
+       while (ds->ds_phys->ds_prev_snap_obj != 0) {
+               dsl_dataset_t *prev;
+               err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+                   FTAG, &prev);
+               if (err) {
+                       dsl_dataset_rele(ds, FTAG);
+                       return (err);
+               }
+
+               /*
+                * If this is a clone, we don't need to worry about it for now.
+                */
+               if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
+                       dsl_dataset_rele(ds, FTAG);
+                       dsl_dataset_rele(prev, FTAG);
+                       return (0);
+               }
+               dsl_dataset_rele(ds, FTAG);
+               ds = prev;
+       }
+
+       VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+           ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0);
+       dsl_dataset_rele(ds, FTAG);
+       return (0);
+}
+
+/*
+ * Scrub/dedup interaction.
+ *
+ * If there are N references to a deduped block, we don't want to scrub it
+ * N times -- ideally, we should scrub it exactly once.
+ *
+ * We leverage the fact that the dde's replication class (enum ddt_class)
+ * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
+ * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
+ *
+ * To prevent excess scrubbing, the scrub begins by walking the DDT
+ * to find all blocks with refcnt > 1, and scrubs each of these once.
+ * Since there are two replication classes which contain blocks with
+ * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
+ * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
+ *
+ * There would be nothing more to say if a block's refcnt couldn't change
+ * during a scrub, but of course it can so we must account for changes
+ * in a block's replication class.
+ *
+ * Here's an example of what can occur:
+ *
+ * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
+ * when visited during the top-down scrub phase, it will be scrubbed twice.
+ * This negates our scrub optimization, but is otherwise harmless.
+ *
+ * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
+ * on each visit during the top-down scrub phase, it will never be scrubbed.
+ * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
+ * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to
+ * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
+ * while a scrub is in progress, it scrubs the block right then.
+ */
+static void
+dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+       ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
+       ddt_entry_t dde = { 0 };
+       int error;
+       uint64_t n = 0;
+
+       /* Walk the DDT from the saved bookmark, entry by entry. */
+       while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
+               ddt_t *ddt;
+
+               if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
+                       break;
+               dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
+                   (longlong_t)ddb->ddb_class,
+                   (longlong_t)ddb->ddb_type,
+                   (longlong_t)ddb->ddb_checksum,
+                   (longlong_t)ddb->ddb_cursor);
+
+               /* There should be no pending changes to the dedup table */
+               ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
+               ASSERT(avl_first(&ddt->ddt_tree) == NULL);
+
+               dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
+               n++;
+
+               if (dsl_scan_check_pause(scn, NULL))
+                       break;
+       }
+
+       zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u",
+           (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max,
+           (int)scn->scn_pausing);
+
+       /* ENOENT means the walk ran off the end of the DDT. */
+       ASSERT(error == 0 || error == ENOENT);
+       ASSERT(error != ENOENT ||
+           ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
+}
+
+/*
+ * Scan a single DDT entry: construct a blkptr for each of its physical
+ * variants born within the current scan window and hand it to the
+ * active scan function with a zeroed bookmark.
+ */
+/* ARGSUSED */
+void
+dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+    ddt_entry_t *dde, dmu_tx_t *tx)
+{
+       const ddt_key_t *ddk = &dde->dde_key;
+       ddt_phys_t *ddp = dde->dde_phys;
+       blkptr_t bp;
+       zbookmark_t zb = { 0 };
+
+       if (scn->scn_phys.scn_state != DSS_SCANNING)
+               return;
+
+       for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+               /* Skip unallocated variants and those outside the window. */
+               if (ddp->ddp_phys_birth == 0 ||
+                   ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg)
+                       continue;
+               ddt_bp_create(checksum, ddk, ddp, &bp);
+
+               scn->scn_visited_this_txg++;
+               scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
+       }
+}
+
+static void
+dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+       dsl_pool_t *dp = scn->scn_dp;
+       zap_cursor_t zc;
+       zap_attribute_t za;
+
+       if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+           scn->scn_phys.scn_ddt_class_max) {
+               scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+               scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+               dsl_scan_ddt(scn, tx);
+               if (scn->scn_pausing)
+                       return;
+       }
+
+       if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
+               /* First do the MOS & ORIGIN */
+
+               scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+               scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+               dsl_scan_visit_rootbp(scn, NULL,
+                   &dp->dp_meta_rootbp, tx);
+               spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+               if (scn->scn_pausing)
+                       return;
+
+               if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
+                       VERIFY(0 == dmu_objset_find_spa(dp->dp_spa,
+                           NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
+               } else {
+                       dsl_scan_visitds(scn,
+                           dp->dp_origin_snap->ds_object, tx);
+               }
+               ASSERT(!scn->scn_pausing);
+       } else if (scn->scn_phys.scn_bookmark.zb_objset !=
+           ZB_DESTROYED_OBJSET) {
+               /*
+                * If we were paused, continue from here.  Note if the
+                * ds we were paused on was deleted, the zb_objset may
+                * be -1, so we will skip this and find a new objset
+                * below.
+                */
+               dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
+               if (scn->scn_pausing)
+                       return;
+       }
+
+       /*
+        * In case we were paused right at the end of the ds, zero the
+        * bookmark so we don't think that we're still trying to resume.
+        */
+       bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t));
+
+       /* keep pulling things out of the zap-object-as-queue */
+       while (zap_cursor_init(&zc, dp->dp_meta_objset,
+           scn->scn_phys.scn_queue_obj),
+           zap_cursor_retrieve(&zc, &za) == 0) {
+               dsl_dataset_t *ds;
+               uint64_t dsobj;
+
+               dsobj = strtonum(za.za_name, NULL);
+               VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj, dsobj, tx));
+
+               /* Set up min/max txg */
+               VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+               if (za.za_first_integer != 0) {
+                       scn->scn_phys.scn_cur_min_txg =
+                           MAX(scn->scn_phys.scn_min_txg,
+                           za.za_first_integer);
+               } else {
+                       scn->scn_phys.scn_cur_min_txg =
+                           MAX(scn->scn_phys.scn_min_txg,
+                           ds->ds_phys->ds_prev_snap_txg);
+               }
+               scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
+               dsl_dataset_rele(ds, FTAG);
+
+               dsl_scan_visitds(scn, dsobj, tx);
+               zap_cursor_fini(&zc);
+               if (scn->scn_pausing)
+                       return;
+       }
+       zap_cursor_fini(&zc);
+}
+
+static int
+dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+       dsl_scan_t *scn = arg;
+       uint64_t elapsed_nanosecs;
+
+       elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+
+       if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+           (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
+           txg_sync_waiting(scn->scn_dp)) ||
+           spa_shutting_down(scn->scn_dp->dp_spa))
+               return (ERESTART);
+
+       zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
+           dmu_tx_get_txg(tx), bp, 0));
+       dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
+           -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
+           -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
+       scn->scn_visited_this_txg++;
+       return (0);
+}
+
+boolean_t
+dsl_scan_active(dsl_scan_t *scn)
+{
+       spa_t *spa = scn->scn_dp->dp_spa;
+       uint64_t used = 0, comp, uncomp;
+
+       if (spa->spa_load_state != SPA_LOAD_NONE)
+               return (B_FALSE);
+       if (spa_shutting_down(spa))
+               return (B_FALSE);
+
+       if (scn->scn_phys.scn_state == DSS_SCANNING)
+               return (B_TRUE);
+
+       if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+               (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
+                   &used, &comp, &uncomp);
+       }
+       return (used != 0);
+}
+
+void
+dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+       dsl_scan_t *scn = dp->dp_scan;
+       spa_t *spa = dp->dp_spa;
+       int err;
+
+       /*
+        * Check for scn_restart_txg before checking spa_load_state, so
+        * that we can restart an old-style scan while the pool is being
+        * imported (see dsl_scan_init).
+        */
+       if (scn->scn_restart_txg != 0 &&
+           scn->scn_restart_txg <= tx->tx_txg) {
+               pool_scan_func_t func = POOL_SCAN_SCRUB;
+               dsl_scan_done(scn, B_FALSE, tx);
+               if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
+                       func = POOL_SCAN_RESILVER;
+               zfs_dbgmsg("restarting scan func=%u txg=%llu",
+                   func, tx->tx_txg);
+               dsl_scan_setup_sync(scn, &func, tx);
+       }
+
+
+       if (!dsl_scan_active(scn) ||
+           spa_sync_pass(dp->dp_spa) > 1)
+               return;
+
+       scn->scn_visited_this_txg = 0;
+       scn->scn_pausing = B_FALSE;
+       scn->scn_sync_start_time = gethrtime();
+       spa->spa_scrub_active = B_TRUE;
+
+       /*
+        * First process the free list.  If we pause the free, don't do
+        * any scanning.  This ensures that there is no free list when
+        * we are scanning, so the scan code doesn't have to worry about
+        * traversing it.
+        */
+       if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+               scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+                   NULL, ZIO_FLAG_MUSTSUCCEED);
+               err = bpobj_iterate(&dp->dp_free_bpobj,
+                   dsl_scan_free_cb, scn, tx);
+               VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+               if (scn->scn_visited_this_txg) {
+                       zfs_dbgmsg("freed %llu blocks in %llums from "
+                           "free_bpobj txg %llu",
+                           (longlong_t)scn->scn_visited_this_txg,
+                           (longlong_t)
+                           (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
+                           (longlong_t)tx->tx_txg);
+                       scn->scn_visited_this_txg = 0;
+                       /*
+                        * Re-sync the ddt so that we can further modify
+                        * it when doing bprewrite.
+                        */
+                       ddt_sync(spa, tx->tx_txg);
+               }
+               if (err == ERESTART)
+                       return;
+       }
+
+       if (scn->scn_phys.scn_state != DSS_SCANNING)
+               return;
+
+
+       if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+           scn->scn_phys.scn_ddt_class_max) {
+               zfs_dbgmsg("doing scan sync txg %llu; "
+                   "ddt bm=%llu/%llu/%llu/%llx",
+                   (longlong_t)tx->tx_txg,
+                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
+                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
+                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
+                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
+               ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
+               ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
+               ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
+               ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
+       } else {
+               zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
+                   (longlong_t)tx->tx_txg,
+                   (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
+                   (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
+                   (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
+                   (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
+       }
+
+       scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+           NULL, ZIO_FLAG_CANFAIL);
+       dsl_scan_visit(scn, tx);
+       (void) zio_wait(scn->scn_zio_root);
+       scn->scn_zio_root = NULL;
+
+       zfs_dbgmsg("visited %llu blocks in %llums",
+           (longlong_t)scn->scn_visited_this_txg,
+           (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC);
+
+       if (!scn->scn_pausing) {
+               /* finished with scan. */
+               zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg);
+               dsl_scan_done(scn, B_TRUE, tx);
+       }
+
+       if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+               mutex_enter(&spa->spa_scrub_lock);
+               while (spa->spa_scrub_inflight > 0) {
+                       cv_wait(&spa->spa_scrub_io_cv,
+                           &spa->spa_scrub_lock);
+               }
+               mutex_exit(&spa->spa_scrub_lock);
+       }
+
+       dsl_scan_sync_state(scn, tx);
+}
+
+/*
+ * This will start a new scan, or restart an existing one.
+ */
+void
+dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
+{
+       if (txg == 0) {
+               dmu_tx_t *tx;
+               tx = dmu_tx_create_dd(dp->dp_mos_dir);
+               VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
+
+               txg = dmu_tx_get_txg(tx);
+               dp->dp_scan->scn_restart_txg = txg;
+               dmu_tx_commit(tx);
+       } else {
+               dp->dp_scan->scn_restart_txg = txg;
+       }
+       zfs_dbgmsg("restarting resilver txg=%llu", txg);
+}
+
+boolean_t
+dsl_scan_resilvering(dsl_pool_t *dp)
+{
+       return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
+           dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
+}
+
+/*
+ * scrub consumers
+ */
+
+static void
+count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
+{
+       int i;
+
+       /*
+        * If we resume after a reboot, zab will be NULL; don't record
+        * incomplete stats in that case.
+        */
+       if (zab == NULL)
+               return;
+
+       for (i = 0; i < 4; i++) {
+               int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
+               int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
+               zfs_blkstat_t *zb = &zab->zab_type[l][t];
+               int equal;
+
+               zb->zb_count++;
+               zb->zb_asize += BP_GET_ASIZE(bp);
+               zb->zb_lsize += BP_GET_LSIZE(bp);
+               zb->zb_psize += BP_GET_PSIZE(bp);
+               zb->zb_gangs += BP_COUNT_GANG(bp);
+
+               switch (BP_GET_NDVAS(bp)) {
+               case 2:
+                       if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+                           DVA_GET_VDEV(&bp->blk_dva[1]))
+                               zb->zb_ditto_2_of_2_samevdev++;
+                       break;
+               case 3:
+                       equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+                           DVA_GET_VDEV(&bp->blk_dva[1])) +
+                           (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+                           DVA_GET_VDEV(&bp->blk_dva[2])) +
+                           (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+                           DVA_GET_VDEV(&bp->blk_dva[2]));
+                       if (equal == 1)
+                               zb->zb_ditto_2_of_3_samevdev++;
+                       else if (equal == 3)
+                               zb->zb_ditto_3_of_3_samevdev++;
+                       break;
+               }
+       }
+}
+
+static void
+dsl_scan_scrub_done(zio_t *zio)
+{
+       spa_t *spa = zio->io_spa;
+
+       zio_data_buf_free(zio->io_data, zio->io_size);
+
+       mutex_enter(&spa->spa_scrub_lock);
+       spa->spa_scrub_inflight--;
+       cv_broadcast(&spa->spa_scrub_io_cv);
+
+       if (zio->io_error && (zio->io_error != ECKSUM ||
+           !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
+               spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
+       }
+       mutex_exit(&spa->spa_scrub_lock);
+}
+
+static int
+dsl_scan_scrub_cb(dsl_pool_t *dp,
+    const blkptr_t *bp, const zbookmark_t *zb)
+{
+       dsl_scan_t *scn = dp->dp_scan;
+       size_t size = BP_GET_PSIZE(bp);
+       spa_t *spa = dp->dp_spa;
+       uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
+       boolean_t needs_io;
+       int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
+       int zio_priority;
+
+       if (phys_birth <= scn->scn_phys.scn_min_txg ||
+           phys_birth >= scn->scn_phys.scn_max_txg)
+               return (0);
+
+       count_block(dp->dp_blkstats, bp);
+
+       ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
+       if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
+               zio_flags |= ZIO_FLAG_SCRUB;
+               zio_priority = ZIO_PRIORITY_SCRUB;
+               needs_io = B_TRUE;
+       } else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
+               zio_flags |= ZIO_FLAG_RESILVER;
+               zio_priority = ZIO_PRIORITY_RESILVER;
+               needs_io = B_FALSE;
+       }
+
+       /* If it's an intent log block, failure is expected. */
+       if (zb->zb_level == ZB_ZIL_LEVEL)
+               zio_flags |= ZIO_FLAG_SPECULATIVE;
+
+       for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
+               vdev_t *vd = vdev_lookup_top(spa,
+                   DVA_GET_VDEV(&bp->blk_dva[d]));
+
+               /*
+                * Keep track of how much data we've examined so that
+                * zpool(1M) status can make useful progress reports.
+                */
+               scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
+               spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
+
+               /* if it's a resilver, this may not be in the target range */
+               if (!needs_io) {
+                       if (DVA_GET_GANG(&bp->blk_dva[d])) {
+                               /*
+                                * Gang members may be spread across multiple
+                                * vdevs, so the best estimate we have is the
+                                * scrub range, which has already been checked.
+                                * XXX -- it would be better to change our
+                                * allocation policy to ensure that all
+                                * gang members reside on the same vdev.
+                                */
+                               needs_io = B_TRUE;
+                       } else {
+                               needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
+                                   phys_birth, 1);
+                       }
+               }
+       }
+
+       if (needs_io && !zfs_no_scrub_io) {
+               void *data = zio_data_buf_alloc(size);
+
+               mutex_enter(&spa->spa_scrub_lock);
+               while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight)
+                       cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+               spa->spa_scrub_inflight++;
+               mutex_exit(&spa->spa_scrub_lock);
+
+               zio_nowait(zio_read(NULL, spa, bp, data, size,
+                   dsl_scan_scrub_done, NULL, zio_priority,
+                   zio_flags, zb));
+       }
+
+       /* do not relocate this block */
+       return (0);
+}
+
+int
+dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
+{
+       spa_t *spa = dp->dp_spa;
+
+       /*
+        * Purge all vdev caches and probe all devices.  We do this here
+        * rather than in sync context because this requires a writer lock
+        * on the spa_config lock, which we can't do from sync context.  The
+        * spa_scrub_reopen flag indicates that vdev_open() should not
+        * attempt to start another scrub.
+        */
+       spa_vdev_state_enter(spa, SCL_NONE);
+       spa->spa_scrub_reopen = B_TRUE;
+       vdev_reopen(spa->spa_root_vdev);
+       spa->spa_scrub_reopen = B_FALSE;
+       (void) spa_vdev_state_exit(spa, NULL, 0);
+
+       return (dsl_sync_task_do(dp, dsl_scan_setup_check,
+           dsl_scan_setup_sync, dp->dp_scan, &func, 0));
+}
diff --git a/module/zfs/dsl_scrub.c b/module/zfs/dsl_scrub.c
deleted file mode 100644
index 03ebb90..0000000
--- a/module/zfs/dsl_scrub.c
+++ /dev/null
@@ -1,1043 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <sys/dsl_pool.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dnode.h>
-#include <sys/dmu_tx.h>
-#include <sys/dmu_objset.h>
-#include <sys/arc.h>
-#include <sys/zap.h>
-#include <sys/zio.h>
-#include <sys/zfs_context.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_znode.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev_impl.h>
-#include <sys/zil_impl.h>
-
-typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
-
-static scrub_cb_t dsl_pool_scrub_clean_cb;
-static dsl_syncfunc_t dsl_pool_scrub_cancel_sync;
-static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
-    uint64_t objset, uint64_t object);
-
-int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */
-int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */
-boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
-
-extern int zfs_txg_timeout;
-
-static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = {
-       NULL,
-       dsl_pool_scrub_clean_cb
-};
-
-#define        SET_BOOKMARK(zb, objset, object, level, blkid)  \
-{                                                       \
-       (zb)->zb_objset = objset;                       \
-       (zb)->zb_object = object;                       \
-       (zb)->zb_level = level;                         \
-       (zb)->zb_blkid = blkid;                         \
-}
-
-/* ARGSUSED */
-static void
-dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
-       dsl_pool_t *dp = arg1;
-       enum scrub_func *funcp = arg2;
-       dmu_object_type_t ot = 0;
-       boolean_t complete = B_FALSE;
-
-       dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx);
-
-       ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE);
-       ASSERT(*funcp > SCRUB_FUNC_NONE);
-       ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS);
-
-       dp->dp_scrub_min_txg = 0;
-       dp->dp_scrub_max_txg = tx->tx_txg;
-
-       if (*funcp == SCRUB_FUNC_CLEAN) {
-               vdev_t *rvd = dp->dp_spa->spa_root_vdev;
-
-               /* rewrite all disk labels */
-               vdev_config_dirty(rvd);
-
-               if (vdev_resilver_needed(rvd,
-                   &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) {
-                       spa_event_notify(dp->dp_spa, NULL,
-                           ESC_ZFS_RESILVER_START);
-                       dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg,
-                           tx->tx_txg);
-               } else {
-                       spa_event_notify(dp->dp_spa, NULL,
-                           ESC_ZFS_SCRUB_START);
-               }
-
-               /* zero out the scrub stats in all vdev_stat_t's */
-               vdev_scrub_stat_update(rvd,
-                   dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
-                   POOL_SCRUB_EVERYTHING, B_FALSE);
-
-               dp->dp_spa->spa_scrub_started = B_TRUE;
-       }
-
-       /* back to the generic stuff */
-
-       if (dp->dp_blkstats == NULL) {
-               dp->dp_blkstats =
-                   kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
-       }
-       bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
-
-       if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB)
-               ot = DMU_OT_ZAP_OTHER;
-
-       dp->dp_scrub_func = *funcp;
-       dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset,
-           ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx);
-       bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
-       dp->dp_scrub_restart = B_FALSE;
-       dp->dp_spa->spa_scrub_errors = 0;
-
-       VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-           DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
-           &dp->dp_scrub_func, tx));
-       VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-           DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
-           &dp->dp_scrub_queue_obj, tx));
-       VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-           DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
-           &dp->dp_scrub_min_txg, tx));
-       VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-           DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
-           &dp->dp_scrub_max_txg, tx));
-       VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-           DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
-           &dp->dp_scrub_bookmark, tx));
-       VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-           DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
-           &dp->dp_spa->spa_scrub_errors, tx));
-
-       spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr,
-           "func=%u mintxg=%llu maxtxg=%llu",
-           *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg);
-}
-
-int
-dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func)
-{
-       return (dsl_sync_task_do(dp, NULL,
-           dsl_pool_scrub_setup_sync, dp, &func, 0));
-}
-
-/* ARGSUSED */
-static void
-dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
-       dsl_pool_t *dp = arg1;
-       boolean_t *completep = arg2;
-
-       if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
-               return;
-
-       mutex_enter(&dp->dp_scrub_cancel_lock);
-
-       if (dp->dp_scrub_restart) {
-               dp->dp_scrub_restart = B_FALSE;
-               *completep = B_FALSE;
-       }
-
-       /* XXX this is scrub-clean specific */
-       mutex_enter(&dp->dp_spa->spa_scrub_lock);
-       while (dp->dp_spa->spa_scrub_inflight > 0) {
-               cv_wait(&dp->dp_spa->spa_scrub_io_cv,
-                   &dp->dp_spa->spa_scrub_lock);
-       }
-       mutex_exit(&dp->dp_spa->spa_scrub_lock);
-       dp->dp_spa->spa_scrub_started = B_FALSE;
-       dp->dp_spa->spa_scrub_active = B_FALSE;
-
-       dp->dp_scrub_func = SCRUB_FUNC_NONE;
-       VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
-           dp->dp_scrub_queue_obj, tx));
-       dp->dp_scrub_queue_obj = 0;
-       bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
-
-       VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-           DMU_POOL_SCRUB_QUEUE, tx));
-       VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-           DMU_POOL_SCRUB_MIN_TXG, tx));
-       VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-           DMU_POOL_SCRUB_MAX_TXG, tx));
-       VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-           DMU_POOL_SCRUB_BOOKMARK, tx));
-       VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-           DMU_POOL_SCRUB_FUNC, tx));
-       VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-           DMU_POOL_SCRUB_ERRORS, tx));
-
-       spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr,
-           "complete=%u", *completep);
-
-       /* below is scrub-clean specific */
-       vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE,
-           *completep);
-       /*
-        * If the scrub/resilver completed, update all DTLs to reflect this.
-        * Whether it succeeded or not, vacate all temporary scrub DTLs.
-        */
-       vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg,
-           *completep ? dp->dp_scrub_max_txg : 0, B_TRUE);
-       if (*completep)
-               spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ?
-                   ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
-       spa_errlog_rotate(dp->dp_spa);
-
-       /*
-        * We may have finished replacing a device.
-        * Let the async thread assess this and handle the detach.
-        */
-       spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE);
-
-       dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0;
-       mutex_exit(&dp->dp_scrub_cancel_lock);
-}
-
-int
-dsl_pool_scrub_cancel(dsl_pool_t *dp)
-{
-       boolean_t complete = B_FALSE;
-
-       return (dsl_sync_task_do(dp, NULL,
-           dsl_pool_scrub_cancel_sync, dp, &complete, 3));
-}
-
-int
-dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp,
-    zio_done_func_t *done, void *private, uint32_t arc_flags)
-{
-       /*
-        * This function will be used by bp-rewrite wad to intercept frees.
-        */
-       return (arc_free(pio, dp->dp_spa, txg, (blkptr_t *)bpp,
-           done, private, arc_flags));
-}
-
-static boolean_t
-bookmark_is_zero(const zbookmark_t *zb)
-{
-       return (zb->zb_objset == 0 && zb->zb_object == 0 &&
-           zb->zb_level == 0 && zb->zb_blkid == 0);
-}
-
-/* dnp is the dnode for zb1->zb_object */
-static boolean_t
-bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1,
-    const zbookmark_t *zb2)
-{
-       uint64_t zb1nextL0, zb2thisobj;
-
-       ASSERT(zb1->zb_objset == zb2->zb_objset);
-       ASSERT(zb1->zb_object != -1ULL);
-       ASSERT(zb2->zb_level == 0);
-
-       /*
-        * A bookmark in the deadlist is considered to be after
-        * everything else.
-        */
-       if (zb2->zb_object == -1ULL)
-               return (B_TRUE);
-
-       /* The objset_phys_t isn't before anything. */
-       if (dnp == NULL)
-               return (B_FALSE);
-
-       zb1nextL0 = (zb1->zb_blkid + 1) <<
-           ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
-
-       zb2thisobj = zb2->zb_object ? zb2->zb_object :
-           zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
-
-       if (zb1->zb_object == 0) {
-               uint64_t nextobj = zb1nextL0 *
-                   (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
-               return (nextobj <= zb2thisobj);
-       }
-
-       if (zb1->zb_object < zb2thisobj)
-               return (B_TRUE);
-       if (zb1->zb_object > zb2thisobj)
-               return (B_FALSE);
-       if (zb2->zb_object == 0)
-               return (B_FALSE);
-       return (zb1nextL0 <= zb2->zb_blkid);
-}
-
-static boolean_t
-scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb)
-{
-       int elapsed_ticks;
-       int mintime;
-
-       if (dp->dp_scrub_pausing)
-               return (B_TRUE); /* we're already pausing */
-
-       if (!bookmark_is_zero(&dp->dp_scrub_bookmark))
-               return (B_FALSE); /* we're resuming */
-
-       /* We only know how to resume from level-0 blocks. */
-       if (zb->zb_level != 0)
-               return (B_FALSE);
-
-       mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time :
-           zfs_scrub_min_time;
-       elapsed_ticks = lbolt64 - dp->dp_scrub_start_time;
-       if (elapsed_ticks > hz * zfs_txg_timeout ||
-           (elapsed_ticks > hz * mintime && txg_sync_waiting(dp))) {
-               dprintf("pausing at %llx/%llx/%llx/%llx\n",
-                   (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object,
-                   (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid);
-               dp->dp_scrub_pausing = B_TRUE;
-               dp->dp_scrub_bookmark = *zb;
-               return (B_TRUE);
-       }
-       return (B_FALSE);
-}
-
-typedef struct zil_traverse_arg {
-       dsl_pool_t      *zta_dp;
-       zil_header_t    *zta_zh;
-} zil_traverse_arg_t;
-
-/* ARGSUSED */
-static void
-traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
-{
-       zil_traverse_arg_t *zta = arg;
-       dsl_pool_t *dp = zta->zta_dp;
-       zil_header_t *zh = zta->zta_zh;
-       zbookmark_t zb;
-
-       if (bp->blk_birth <= dp->dp_scrub_min_txg)
-               return;
-
-       /*
-        * One block ("stubby") can be allocated a long time ago; we
-        * want to visit that one because it has been allocated
-        * (on-disk) even if it hasn't been claimed (even though for
-        * plain scrub there's nothing to do to it).
-        */
-       if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
-               return;
-
-       zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET];
-       zb.zb_object = 0;
-       zb.zb_level = -1;
-       zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
-       VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
-}
-
-/* ARGSUSED */
-static void
-traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
-{
-       if (lrc->lrc_txtype == TX_WRITE) {
-               zil_traverse_arg_t *zta = arg;
-               dsl_pool_t *dp = zta->zta_dp;
-               zil_header_t *zh = zta->zta_zh;
-               lr_write_t *lr = (lr_write_t *)lrc;
-               blkptr_t *bp = &lr->lr_blkptr;
-               zbookmark_t zb;
-
-               if (bp->blk_birth <= dp->dp_scrub_min_txg)
-                       return;
-
-               /*
-                * birth can be < claim_txg if this record's txg is
-                * already txg sync'ed (but this log block contains
-                * other records that are not synced)
-                */
-               if (claim_txg == 0 || bp->blk_birth < claim_txg)
-                       return;
-
-               zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET];
-               zb.zb_object = lr->lr_foid;
-               zb.zb_level = BP_GET_LEVEL(bp);
-               zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
-               VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
-       }
-}
-
-static void
-traverse_zil(dsl_pool_t *dp, zil_header_t *zh)
-{
-       uint64_t claim_txg = zh->zh_claim_txg;
-       zil_traverse_arg_t zta = { dp, zh };
-       zilog_t *zilog;
-
-       /*
-        * We only want to visit blocks that have been claimed but not yet
-        * replayed (or, in read-only mode, blocks that *would* be claimed).
-        */
-       if (claim_txg == 0 && spa_writeable(dp->dp_spa))
-               return;
-
-       zilog = zil_alloc(dp->dp_meta_objset, zh);
-
-       (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, &zta,
-           claim_txg);
-
-       zil_free(zilog);
-}
-
-static void
-scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
-    arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
-{
-       int err;
-       arc_buf_t *buf = NULL;
-
-       if (bp->blk_birth <= dp->dp_scrub_min_txg)
-               return;
-
-       if (scrub_pause(dp, zb))
-               return;
-
-       if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) {
-               /*
-                * If we already visited this bp & everything below (in
-                * a prior txg), don't bother doing it again.
-                */
-               if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark))
-                       return;
-
-               /*
-                * If we found the block we're trying to resume from, or
-                * we went past it to a different object, zero it out to
-                * indicate that it's OK to start checking for pausing
-                * again.
-                */
-               if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 ||
-                   zb->zb_object > dp->dp_scrub_bookmark.zb_object) {
-                       dprintf("resuming at %llx/%llx/%llx/%llx\n",
-                           (longlong_t)zb->zb_objset,
-                           (longlong_t)zb->zb_object,
-                           (longlong_t)zb->zb_level,
-                           (longlong_t)zb->zb_blkid);
-                       bzero(&dp->dp_scrub_bookmark, sizeof (*zb));
-               }
-       }
-
-       if (BP_GET_LEVEL(bp) > 0) {
-               uint32_t flags = ARC_WAIT;
-               int i;
-               blkptr_t *cbp;
-               int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
-
-               err = arc_read(NULL, dp->dp_spa, bp, pbuf,
-                   arc_getbuf_func, &buf,
-                   ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
-               if (err) {
-                       mutex_enter(&dp->dp_spa->spa_scrub_lock);
-                       dp->dp_spa->spa_scrub_errors++;
-                       mutex_exit(&dp->dp_spa->spa_scrub_lock);
-                       return;
-               }
-               cbp = buf->b_data;
-
-               for (i = 0; i < epb; i++, cbp++) {
-                       zbookmark_t czb;
-
-                       SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
-                           zb->zb_level - 1,
-                           zb->zb_blkid * epb + i);
-                       scrub_visitbp(dp, dnp, buf, cbp, &czb);
-               }
-       } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
-               uint32_t flags = ARC_WAIT;
-               dnode_phys_t *child_dnp;
-               int i;
-               int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
-
-               err = arc_read(NULL, dp->dp_spa, bp, pbuf,
-                   arc_getbuf_func, &buf,
-                   ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
-               if (err) {
-                       mutex_enter(&dp->dp_spa->spa_scrub_lock);
-                       dp->dp_spa->spa_scrub_errors++;
-                       mutex_exit(&dp->dp_spa->spa_scrub_lock);
-                       return;
-               }
-               child_dnp = buf->b_data;
-
-               for (i = 0; i < epb; i++, child_dnp++) {
-                       scrub_visitdnode(dp, child_dnp, buf, zb->zb_objset,
-                           zb->zb_blkid * epb + i);
-               }
-       } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
-               uint32_t flags = ARC_WAIT;
-               objset_phys_t *osp;
-
-               err = arc_read_nolock(NULL, dp->dp_spa, bp,
-                   arc_getbuf_func, &buf,
-                   ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
-               if (err) {
-                       mutex_enter(&dp->dp_spa->spa_scrub_lock);
-                       dp->dp_spa->spa_scrub_errors++;
-                       mutex_exit(&dp->dp_spa->spa_scrub_lock);
-                       return;
-               }
-
-               osp = buf->b_data;
-
-               traverse_zil(dp, &osp->os_zil_header);
-
-               scrub_visitdnode(dp, &osp->os_meta_dnode,
-                   buf, zb->zb_objset, 0);
-               if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
-                       scrub_visitdnode(dp, &osp->os_userused_dnode,
-                           buf, zb->zb_objset, 0);
-                       scrub_visitdnode(dp, &osp->os_groupused_dnode,
-                           buf, zb->zb_objset, 0);
-               }
-       }
-
-       (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb);
-       if (buf)
-               (void) arc_buf_remove_ref(buf, &buf);
-}
-
-static void
-scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
-    uint64_t objset, uint64_t object)
-{
-       int j;
-
-       for (j = 0; j < dnp->dn_nblkptr; j++) {
-               zbookmark_t czb;
-
-               SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
-               scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb);
-       }
-
-}
-
-static void
-scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp)
-{
-       zbookmark_t zb;
-
-       SET_BOOKMARK(&zb, ds ? ds->ds_object : 0, 0, -1, 0);
-       scrub_visitbp(dp, NULL, NULL, bp, &zb);
-}
-
-void
-dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
-       dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
-       if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
-               return;
-
-       if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
-               SET_BOOKMARK(&dp->dp_scrub_bookmark, -1, 0, 0, 0);
-       } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-           ds->ds_object, tx) != 0) {
-               return;
-       }
-
-       if (ds->ds_phys->ds_next_snap_obj != 0) {
-               VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-                   ds->ds_phys->ds_next_snap_obj, tx) == 0);
-       }
-       ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
-}
-
-void
-dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
-       dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
-       if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
-               return;
-
-       ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
-
-       if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
-               dp->dp_scrub_bookmark.zb_objset =
-                   ds->ds_phys->ds_prev_snap_obj;
-       } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-           ds->ds_object, tx) == 0) {
-               VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-                   ds->ds_phys->ds_prev_snap_obj, tx) == 0);
-       }
-}
-
-void
-dsl_pool_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
-{
-       dsl_pool_t *dp = ds1->ds_dir->dd_pool;
-
-       if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
-               return;
-
-       if (dp->dp_scrub_bookmark.zb_objset == ds1->ds_object) {
-               dp->dp_scrub_bookmark.zb_objset = ds2->ds_object;
-       } else if (dp->dp_scrub_bookmark.zb_objset == ds2->ds_object) {
-               dp->dp_scrub_bookmark.zb_objset = ds1->ds_object;
-       }
-
-       if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-           ds1->ds_object, tx) == 0) {
-               int err = zap_add_int(dp->dp_meta_objset,
-                   dp->dp_scrub_queue_obj, ds2->ds_object, tx);
-               VERIFY(err == 0 || err == EEXIST);
-               if (err == EEXIST) {
-                       /* Both were there to begin with */
-                       VERIFY(0 == zap_add_int(dp->dp_meta_objset,
-                           dp->dp_scrub_queue_obj, ds1->ds_object, tx));
-               }
-       } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-           ds2->ds_object, tx) == 0) {
-               VERIFY(0 == zap_add_int(dp->dp_meta_objset,
-                   dp->dp_scrub_queue_obj, ds1->ds_object, tx));
-       }
-}
-
-struct enqueue_clones_arg {
-       dmu_tx_t *tx;
-       uint64_t originobj;
-};
-
-/* ARGSUSED */
-static int
-enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
-{
-       struct enqueue_clones_arg *eca = arg;
-       dsl_dataset_t *ds;
-       int err;
-       dsl_pool_t *dp;
-
-       err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
-       if (err)
-               return (err);
-       dp = ds->ds_dir->dd_pool;
-
-       if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
-               while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
-                       dsl_dataset_t *prev;
-                       err = dsl_dataset_hold_obj(dp,
-                           ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
-
-                       dsl_dataset_rele(ds, FTAG);
-                       if (err)
-                               return (err);
-                       ds = prev;
-               }
-               VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-                   ds->ds_object, eca->tx) == 0);
-       }
-       dsl_dataset_rele(ds, FTAG);
-       return (0);
-}
-
-static void
-scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx)
-{
-       dsl_dataset_t *ds;
-       uint64_t min_txg_save;
-
-       VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
-
-       /*
-        * Iterate over the bps in this ds.
-        */
-       min_txg_save = dp->dp_scrub_min_txg;
-       dp->dp_scrub_min_txg =
-           MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg);
-       scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp);
-       dp->dp_scrub_min_txg = min_txg_save;
-
-       if (dp->dp_scrub_pausing)
-               goto out;
-
-       /*
-        * Add descendent datasets to work queue.
-        */
-       if (ds->ds_phys->ds_next_snap_obj != 0) {
-               VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-                   ds->ds_phys->ds_next_snap_obj, tx) == 0);
-       }
-       if (ds->ds_phys->ds_num_children > 1) {
-               if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
-                       struct enqueue_clones_arg eca;
-                       eca.tx = tx;
-                       eca.originobj = ds->ds_object;
-
-                       (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
-                           NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
-               } else {
-                       VERIFY(zap_join(dp->dp_meta_objset,
-                           ds->ds_phys->ds_next_clones_obj,
-                           dp->dp_scrub_queue_obj, tx) == 0);
-               }
-       }
-
-out:
-       dsl_dataset_rele(ds, FTAG);
-}
-
-/* ARGSUSED */
-static int
-enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
-{
-       dmu_tx_t *tx = arg;
-       dsl_dataset_t *ds;
-       int err;
-       dsl_pool_t *dp;
-
-       err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
-       if (err)
-               return (err);
-
-       dp = ds->ds_dir->dd_pool;
-
-       while (ds->ds_phys->ds_prev_snap_obj != 0) {
-               dsl_dataset_t *prev;
-               err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
-                   FTAG, &prev);
-               if (err) {
-                       dsl_dataset_rele(ds, FTAG);
-                       return (err);
-               }
-
-               /*
-                * If this is a clone, we don't need to worry about it for now.
-                */
-               if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
-                       dsl_dataset_rele(ds, FTAG);
-                       dsl_dataset_rele(prev, FTAG);
-                       return (0);
-               }
-               dsl_dataset_rele(ds, FTAG);
-               ds = prev;
-       }
-
-       VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-           ds->ds_object, tx) == 0);
-       dsl_dataset_rele(ds, FTAG);
-       return (0);
-}
-
-void
-dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
-{
-       spa_t *spa = dp->dp_spa;
-       zap_cursor_t zc;
-       zap_attribute_t za;
-       boolean_t complete = B_TRUE;
-
-       if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
-               return;
-
-       /*
-        * If the pool is not loaded, or is trying to unload, leave it alone.
-        */
-       if (spa->spa_load_state != SPA_LOAD_NONE || spa_shutting_down(spa))
-               return;
-
-       if (dp->dp_scrub_restart) {
-               enum scrub_func func = dp->dp_scrub_func;
-               dp->dp_scrub_restart = B_FALSE;
-               dsl_pool_scrub_setup_sync(dp, &func, kcred, tx);
-       }
-
-       if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
-               /*
-                * We must have resumed after rebooting; reset the vdev
-                * stats to know that we're doing a scrub (although it
-                * will think we're just starting now).
-                */
-               vdev_scrub_stat_update(spa->spa_root_vdev,
-                   dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
-                   POOL_SCRUB_EVERYTHING, B_FALSE);
-       }
-
-       dp->dp_scrub_pausing = B_FALSE;
-       dp->dp_scrub_start_time = lbolt64;
-       dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);
-       spa->spa_scrub_active = B_TRUE;
-
-       if (dp->dp_scrub_bookmark.zb_objset == 0) {
-               /* First do the MOS & ORIGIN */
-               scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp);
-               if (dp->dp_scrub_pausing)
-                       goto out;
-
-               if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
-                       VERIFY(0 == dmu_objset_find_spa(spa,
-                           NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
-               } else {
-                       scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx);
-               }
-               ASSERT(!dp->dp_scrub_pausing);
-       } else if (dp->dp_scrub_bookmark.zb_objset != -1ULL) {
-               /*
-                * If we were paused, continue from here.  Note if the
-                * ds we were paused on was deleted, the zb_objset will
-                * be -1, so we will skip this and find a new objset
-                * below.
-                */
-               scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx);
-               if (dp->dp_scrub_pausing)
-                       goto out;
-       }
-
-       /*
-        * In case we were paused right at the end of the ds, zero the
-        * bookmark so we don't think that we're still trying to resume.
-        */
-       bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
-
-       /* keep pulling things out of the zap-object-as-queue */
-       while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj),
-           zap_cursor_retrieve(&zc, &za) == 0) {
-               VERIFY(0 == zap_remove(dp->dp_meta_objset,
-                   dp->dp_scrub_queue_obj, za.za_name, tx));
-               scrub_visitds(dp, za.za_first_integer, tx);
-               if (dp->dp_scrub_pausing)
-                       break;
-               zap_cursor_fini(&zc);
-       }
-       zap_cursor_fini(&zc);
-       if (dp->dp_scrub_pausing)
-               goto out;
-
-       /* done. */
-
-       dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx);
-       return;
-out:
-       VERIFY(0 == zap_update(dp->dp_meta_objset,
-           DMU_POOL_DIRECTORY_OBJECT,
-           DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
-           &dp->dp_scrub_bookmark, tx));
-       VERIFY(0 == zap_update(dp->dp_meta_objset,
-           DMU_POOL_DIRECTORY_OBJECT,
-           DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
-           &spa->spa_scrub_errors, tx));
-
-       /* XXX this is scrub-clean specific */
-       mutex_enter(&spa->spa_scrub_lock);
-       while (spa->spa_scrub_inflight > 0)
-               cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
-       mutex_exit(&spa->spa_scrub_lock);
-}
-
-void
-dsl_pool_scrub_restart(dsl_pool_t *dp)
-{
-       mutex_enter(&dp->dp_scrub_cancel_lock);
-       dp->dp_scrub_restart = B_TRUE;
-       mutex_exit(&dp->dp_scrub_cancel_lock);
-}
-
-/*
- * scrub consumers
- */
-
-static void
-count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
-{
-       int i;
-
-       /*
-        * If we resume after a reboot, zab will be NULL; don't record
-        * incomplete stats in that case.
-        */
-       if (zab == NULL)
-               return;
-
-       for (i = 0; i < 4; i++) {
-               int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
-               int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
-               zfs_blkstat_t *zb = &zab->zab_type[l][t];
-               int equal;
-
-               zb->zb_count++;
-               zb->zb_asize += BP_GET_ASIZE(bp);
-               zb->zb_lsize += BP_GET_LSIZE(bp);
-               zb->zb_psize += BP_GET_PSIZE(bp);
-               zb->zb_gangs += BP_COUNT_GANG(bp);
-
-               switch (BP_GET_NDVAS(bp)) {
-               case 2:
-                       if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
-                           DVA_GET_VDEV(&bp->blk_dva[1]))
-                               zb->zb_ditto_2_of_2_samevdev++;
-                       break;
-               case 3:
-                       equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
-                           DVA_GET_VDEV(&bp->blk_dva[1])) +
-                           (DVA_GET_VDEV(&bp->blk_dva[0]) ==
-                           DVA_GET_VDEV(&bp->blk_dva[2])) +
-                           (DVA_GET_VDEV(&bp->blk_dva[1]) ==
-                           DVA_GET_VDEV(&bp->blk_dva[2]));
-                       if (equal == 1)
-                               zb->zb_ditto_2_of_3_samevdev++;
-                       else if (equal == 3)
-                               zb->zb_ditto_3_of_3_samevdev++;
-                       break;
-               }
-       }
-}
-
-static void
-dsl_pool_scrub_clean_done(zio_t *zio)
-{
-       spa_t *spa = zio->io_spa;
-
-       zio_data_buf_free(zio->io_data, zio->io_size);
-
-       mutex_enter(&spa->spa_scrub_lock);
-       spa->spa_scrub_inflight--;
-       cv_broadcast(&spa->spa_scrub_io_cv);
-
-       if (zio->io_error && (zio->io_error != ECKSUM ||
-           !(zio->io_flags & ZIO_FLAG_SPECULATIVE)))
-               spa->spa_scrub_errors++;
-       mutex_exit(&spa->spa_scrub_lock);
-}
-
-static int
-dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
-    const blkptr_t *bp, const zbookmark_t *zb)
-{
-       size_t size = BP_GET_PSIZE(bp);
-       spa_t *spa = dp->dp_spa;
-       boolean_t needs_io;
-       int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
-       int zio_priority;
-
-       ASSERT(bp->blk_birth > dp->dp_scrub_min_txg);
-
-       if (bp->blk_birth >= dp->dp_scrub_max_txg)
-               return (0);
-
-       count_block(dp->dp_blkstats, bp);
-
-       if (dp->dp_scrub_isresilver == 0) {
-               /* It's a scrub */
-               zio_flags |= ZIO_FLAG_SCRUB;
-               zio_priority = ZIO_PRIORITY_SCRUB;
-               needs_io = B_TRUE;
-       } else {
-               /* It's a resilver */
-               zio_flags |= ZIO_FLAG_RESILVER;
-               zio_priority = ZIO_PRIORITY_RESILVER;
-               needs_io = B_FALSE;
-       }
-
-       /* If it's an intent log block, failure is expected. */
-       if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
-               zio_flags |= ZIO_FLAG_SPECULATIVE;
-
-       for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
-               vdev_t *vd = vdev_lookup_top(spa,
-                   DVA_GET_VDEV(&bp->blk_dva[d]));
-
-               /*
-                * Keep track of how much data we've examined so that
-                * zpool(1M) status can make useful progress reports.
-                */
-               mutex_enter(&vd->vdev_stat_lock);
-               vd->vdev_stat.vs_scrub_examined +=
-                   DVA_GET_ASIZE(&bp->blk_dva[d]);
-               mutex_exit(&vd->vdev_stat_lock);
-
-               /* if it's a resilver, this may not be in the target range */
-               if (!needs_io) {
-                       if (DVA_GET_GANG(&bp->blk_dva[d])) {
-                               /*
-                                * Gang members may be spread across multiple
-                                * vdevs, so the best estimate we have is the
-                                * scrub range, which has already been checked.
-                                * XXX -- it would be better to change our
-                                * allocation policy to ensure that all
-                                * gang members reside on the same vdev.
-                                */
-                               needs_io = B_TRUE;
-                       } else {
-                               needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
-                                   bp->blk_birth, 1);
-                       }
-               }
-       }
-
-       if (needs_io && !zfs_no_scrub_io) {
-               void *data = zio_data_buf_alloc(size);
-
-               mutex_enter(&spa->spa_scrub_lock);
-               while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight)
-                       cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
-               spa->spa_scrub_inflight++;
-               mutex_exit(&spa->spa_scrub_lock);
-
-               zio_nowait(zio_read(NULL, spa, bp, data, size,
-                   dsl_pool_scrub_clean_done, NULL, zio_priority,
-                   zio_flags, zb));
-       }
-
-       /* do not relocate this block */
-       return (0);
-}
-
-int
-dsl_pool_scrub_clean(dsl_pool_t *dp)
-{
-       spa_t *spa = dp->dp_spa;
-
-       /*
-        * Purge all vdev caches.  We do this here rather than in sync
-        * context because this requires a writer lock on the spa_config
-        * lock, which we can't do from sync context.  The
-        * spa_scrub_reopen flag indicates that vdev_open() should not
-        * attempt to start another scrub.
-        */
-       spa_vdev_state_enter(spa);
-       spa->spa_scrub_reopen = B_TRUE;
-       vdev_reopen(spa->spa_root_vdev);
-       spa->spa_scrub_reopen = B_FALSE;
-       (void) spa_vdev_state_exit(spa, NULL, 0);
-
-       return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN));
-}
index 2110022..832685b 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_synctask.h>
-#include <sys/cred.h>
+#include <sys/metaslab.h>
 
 #define        DST_AVG_BLKSHIFT 14
 
@@ -50,7 +47,6 @@ dsl_sync_task_group_create(dsl_pool_t *dp)
        list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t),
            offsetof(dsl_sync_task_t, dst_node));
        dstg->dstg_pool = dp;
-       dstg->dstg_cr = CRED();
 
        return (dstg);
 }
@@ -112,14 +108,21 @@ top:
                return (dstg->dstg_err);
        }
 
-       VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg));
+       /*
+        * We don't generally have many sync tasks, so pay the price of
+        * add_tail to get the tasks executed in the right order.
+        */
+       VERIFY(0 == txg_list_add_tail(&dstg->dstg_pool->dp_sync_tasks,
+           dstg, txg));
 
        dmu_tx_commit(tx);
 
        txg_wait_synced(dstg->dstg_pool, txg);
 
-       if (dstg->dstg_err == EAGAIN)
+       if (dstg->dstg_err == EAGAIN) {
+               txg_wait_synced(dstg->dstg_pool, txg + TXG_DEFER_SIZE);
                goto top;
+       }
 
        return (dstg->dstg_err);
 }
@@ -131,7 +134,12 @@ dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
 
        dstg->dstg_nowaiter = B_TRUE;
        txg = dmu_tx_get_txg(tx);
-       VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg));
+       /*
+        * We don't generally have many sync tasks, so pay the price of
+        * add_tail to get the tasks executed in the right order.
+        */
+       VERIFY(0 == txg_list_add_tail(&dstg->dstg_pool->dp_sync_tasks,
+           dstg, txg));
 }
 
 void
@@ -150,25 +158,30 @@ void
 dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
 {
        dsl_sync_task_t *dst;
-       void *tr_cookie;
+       dsl_pool_t *dp = dstg->dstg_pool;
+       uint64_t quota, used;
 
        ASSERT3U(dstg->dstg_err, ==, 0);
 
        /*
-        * Check for sufficient space.
+        * Check for sufficient space.  We just check against what's
+        * on-disk; we don't want any in-flight accounting to get in our
+        * way, because open context may have already used up various
+        * in-core limits (arc_tempreserve, dsl_pool_tempreserve).
         */
-       dstg->dstg_err = dsl_dir_tempreserve_space(dstg->dstg_pool->dp_mos_dir,
-           dstg->dstg_space, dstg->dstg_space * 3, 0, 0, &tr_cookie, tx);
-       /* don't bother trying again */
-       if (dstg->dstg_err == ERESTART)
-               dstg->dstg_err = EAGAIN;
-       if (dstg->dstg_err)
+       quota = dsl_pool_adjustedsize(dp, B_FALSE) -
+           metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
+       used = dp->dp_root_dir->dd_phys->dd_used_bytes;
+       /* MOS space is triple-dittoed, so we multiply by 3. */
+       if (dstg->dstg_space > 0 && used + dstg->dstg_space * 3 > quota) {
+               dstg->dstg_err = ENOSPC;
                return;
+       }
 
        /*
         * Check for errors by calling checkfuncs.
         */
-       rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_WRITER);
+       rw_enter(&dp->dp_config_rwlock, RW_WRITER);
        for (dst = list_head(&dstg->dstg_tasks); dst;
            dst = list_next(&dstg->dstg_tasks, dst)) {
                dst->dst_err =
@@ -183,13 +196,10 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
                 */
                for (dst = list_head(&dstg->dstg_tasks); dst;
                    dst = list_next(&dstg->dstg_tasks, dst)) {
-                       dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2,
-                           dstg->dstg_cr, tx);
+                       dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx);
                }
        }
-       rw_exit(&dstg->dstg_pool->dp_config_rwlock);
-
-       dsl_dir_tempreserve_clear(tr_cookie, tx);
+       rw_exit(&dp->dp_config_rwlock);
 
        if (dstg->dstg_nowaiter)
                dsl_sync_task_group_destroy(dstg);
index 3cc979d..78943ed 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
@@ -94,6 +93,8 @@ static ulong_t ereport_qlen = 0;
 static size_t ereport_size = 0;
 static int ereport_cols = 80;
 
+extern void fastreboot_disable_highpil(void);
+
 /*
  * Common fault management kstats to record ereport generation
  * failures
@@ -374,6 +375,9 @@ fm_panic(const char *format, ...)
        va_list ap;
 
        (void) casptr((void *)&fm_panicstr, NULL, (void *)format);
+#if defined(__i386) || defined(__amd64)
+       fastreboot_disable_highpil();
+#endif /* __i386 || __amd64 */
        va_start(ap, format);
        vpanic(format, ap);
        va_end(ap);
@@ -512,10 +516,10 @@ fm_ereport_post(nvlist_t *ereport, int evc_flag)
        if (sysevent_evc_publish(error_chan, EC_FM, ESC_FM_ERROR,
            SUNW_VENDOR, FM_PUB, ereport, evc_flag) != 0) {
                atomic_add_64(&erpt_kstat_data.erpt_dropped.value.ui64, 1);
-               sysevent_evc_unbind(error_chan);
+               (void) sysevent_evc_unbind(error_chan);
                return;
        }
-       sysevent_evc_unbind(error_chan);
+       (void) sysevent_evc_unbind(error_chan);
 }
 
 /*
@@ -788,6 +792,14 @@ fm_payload_set(nvlist_t *payload, ...)
  *     detector                nvlist_t        <detector>
  *     ereport-payload         nvlist_t        <var args>
  *
+ * We don't actually add a 'version' member to the payload.  Really,
+ * the version quoted to us by our caller is that of the category 1
+ * "ereport" event class (and we require FM_EREPORT_VERS0) but
+ * the payload version of the actual leaf class event under construction
+ * may be something else.  Callers should supply a version in the varargs,
+ * or (better) we could take two version arguments - one for the
+ * ereport category 1 classification (expect FM_EREPORT_VERS0) and one
+ * for the leaf class.
  */
 void
 fm_ereport_set(nvlist_t *ereport, int version, const char *erpt_class,
@@ -920,46 +932,41 @@ fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth,
  *     version                 uint8_t         0
  *     auth                    nvlist_t        <auth>
  *     devpath                 string          <devpath>
- *     devid                   string          <devid>
+ *     [devid]                 string          <devid>
+ *     [target-port-l0id]      string          <target-port-lun0-id>
  *
  * Note that auth and devid are optional members.
  */
 void
 fm_fmri_dev_set(nvlist_t *fmri_dev, int version, const nvlist_t *auth,
-    const char *devpath, const char *devid)
+    const char *devpath, const char *devid, const char *tpl0)
 {
+       int err = 0;
+
        if (version != DEV_SCHEME_VERSION0) {
                atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
                return;
        }
 
-       if (nvlist_add_uint8(fmri_dev, FM_VERSION, version) != 0) {
-               atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
-               return;
-       }
-
-       if (nvlist_add_string(fmri_dev, FM_FMRI_SCHEME,
-           FM_FMRI_SCHEME_DEV) != 0) {
-               atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
-               return;
-       }
+       err |= nvlist_add_uint8(fmri_dev, FM_VERSION, version);
+       err |= nvlist_add_string(fmri_dev, FM_FMRI_SCHEME, FM_FMRI_SCHEME_DEV);
 
        if (auth != NULL) {
-               if (nvlist_add_nvlist(fmri_dev, FM_FMRI_AUTHORITY,
-                   (nvlist_t *)auth) != 0) {
-                       atomic_add_64(
-                           &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
-               }
+               err |= nvlist_add_nvlist(fmri_dev, FM_FMRI_AUTHORITY,
+                   (nvlist_t *)auth);
        }
 
-       if (nvlist_add_string(fmri_dev, FM_FMRI_DEV_PATH, devpath) != 0) {
-               atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
-       }
+       err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_PATH, devpath);
 
        if (devid != NULL)
-               if (nvlist_add_string(fmri_dev, FM_FMRI_DEV_ID, devid) != 0)
-                       atomic_add_64(
-                           &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+               err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_ID, devid);
+
+       if (tpl0 != NULL)
+               err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_TGTPTLUN0, tpl0);
+
+       if (err)
+               atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+
 }
 
 /*
@@ -1264,3 +1271,102 @@ print_msg_hwerr(ctid_t ct_id, proc_t *p)
        uprintf("Killed process %d (%s) in contract id %d "
            "due to hardware error\n", p->p_pid, p->p_user.u_comm, ct_id);
 }
+
+void
+fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth,
+    nvlist_t *snvl, nvlist_t *bboard, int npairs, ...)
+{
+       nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
+       nvlist_t *pairs[HC_MAXPAIRS];
+       nvlist_t **hcl;
+       uint_t n;
+       int i, j;
+       va_list ap;
+       char *hcname, *hcid;
+
+       if (!fm_fmri_hc_set_common(fmri, version, auth))
+               return;
+
+       /*
+        * copy the bboard nvpairs to the pairs array
+        */
+       if (nvlist_lookup_nvlist_array(bboard, FM_FMRI_HC_LIST, &hcl, &n)
+           != 0) {
+               atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+               return;
+       }
+
+       for (i = 0; i < n; i++) {
+               if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME,
+                   &hcname) != 0) {
+                       atomic_add_64(
+                           &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+                       return;
+               }
+               if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &hcid) != 0) {
+                       atomic_add_64(
+                           &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+                       return;
+               }
+
+               pairs[i] = fm_nvlist_create(nva);
+               if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, hcname) != 0 ||
+                   nvlist_add_string(pairs[i], FM_FMRI_HC_ID, hcid) != 0) {
+                       for (j = 0; j <= i; j++) {
+                               if (pairs[j] != NULL)
+                                       fm_nvlist_destroy(pairs[j],
+                                           FM_NVA_RETAIN);
+                       }
+                       atomic_add_64(
+                           &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+                       return;
+               }
+       }
+
+       /*
+        * create the pairs from passed in pairs
+        */
+       npairs = MIN(npairs, HC_MAXPAIRS);
+
+       va_start(ap, npairs);
+       for (i = n; i < npairs + n; i++) {
+               const char *name = va_arg(ap, const char *);
+               uint32_t id = va_arg(ap, uint32_t);
+               char idstr[11];
+               (void) snprintf(idstr, sizeof (idstr), "%u", id);
+               pairs[i] = fm_nvlist_create(nva);
+               if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
+                   nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
+                       for (j = 0; j <= i; j++) {
+                               if (pairs[j] != NULL)
+                                       fm_nvlist_destroy(pairs[j],
+                                           FM_NVA_RETAIN);
+                       }
+                       atomic_add_64(
+                           &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+                       return;
+               }
+       }
+       va_end(ap);
+
+       /*
+        * Create the fmri hc list
+        */
+       if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs,
+           npairs + n) != 0) {
+               atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+               return;
+       }
+
+       for (i = 0; i < npairs + n; i++) {
+                       fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
+       }
+
+       if (snvl != NULL) {
+               if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
+                       atomic_add_64(
+                           &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+                       return;
+               }
+       }
+}
index 6e5955b..8f189c6 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_ARC_H
@@ -48,7 +47,8 @@ arc_done_func_t arc_getbuf_func;
 struct arc_buf {
        arc_buf_hdr_t           *b_hdr;
        arc_buf_t               *b_next;
-       krwlock_t               b_lock;
+       kmutex_t                b_evict_lock;
+       krwlock_t               b_data_lock;
        void                    *b_data;
        arc_evict_func_t        *b_efunc;
        void                    *b_private;
@@ -87,10 +87,13 @@ arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
     arc_buf_contents_t type);
 arc_buf_t *arc_loan_buf(spa_t *spa, int size);
 void arc_return_buf(arc_buf_t *buf, void *tag);
+void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
 void arc_buf_add_ref(arc_buf_t *buf, void *tag);
 int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
 int arc_buf_size(arc_buf_t *buf);
 void arc_release(arc_buf_t *buf, void *tag);
+int arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
+    zbookmark_t *zb);
 int arc_released(arc_buf_t *buf);
 int arc_has_callback(arc_buf_t *buf);
 void arc_buf_freeze(arc_buf_t *buf);
@@ -99,28 +102,16 @@ void arc_buf_thaw(arc_buf_t *buf);
 int arc_referenced(arc_buf_t *buf);
 #endif
 
-typedef struct writeprops {
-       dmu_object_type_t wp_type;
-       uint8_t wp_level;
-       uint8_t wp_copies;
-       uint8_t wp_dncompress, wp_oscompress;
-       uint8_t wp_dnchecksum, wp_oschecksum;
-} writeprops_t;
-
-void write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp);
-int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
+int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
     arc_done_func_t *done, void *private, int priority, int zio_flags,
     uint32_t *arc_flags, const zbookmark_t *zb);
-int arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
+int arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     arc_done_func_t *done, void *private, int priority, int flags,
     uint32_t *arc_flags, const zbookmark_t *zb);
-zio_t *arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
-    boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
-    arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
-    int zio_flags, const zbookmark_t *zb);
-int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    zio_done_func_t *done, void *private, uint32_t arc_flags);
-int arc_tryread(spa_t *spa, blkptr_t *bp, void *data);
+zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
+    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
+    arc_done_func_t *ready, arc_done_func_t *done, void *private,
+    int priority, int zio_flags, const zbookmark_t *zb);
 
 void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
 int arc_buf_evict(arc_buf_t *buf);
index cdb93a6..471be90 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_BPLIST_H
 #define        _SYS_BPLIST_H
 
-#include <sys/dmu.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
 #include <sys/zfs_context.h>
+#include <sys/spa.h>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-typedef struct bplist_phys {
-       /*
-        * This is the bonus buffer for the dead lists.  The object's
-        * contents is an array of bpl_entries blkptr_t's, representing
-        * a total of bpl_bytes physical space.
-        */
-       uint64_t        bpl_entries;
-       uint64_t        bpl_bytes;
-       uint64_t        bpl_comp;
-       uint64_t        bpl_uncomp;
-} bplist_phys_t;
-
-#define        BPLIST_SIZE_V0  (2 * sizeof (uint64_t))
-
-typedef struct bplist_q {
-       blkptr_t        bpq_blk;
-       void            *bpq_next;
-} bplist_q_t;
+typedef struct bplist_entry {
+       blkptr_t        bpe_blk;
+       list_node_t     bpe_node;
+} bplist_entry_t;
 
 typedef struct bplist {
        kmutex_t        bpl_lock;
-       objset_t        *bpl_mos;
-       uint64_t        bpl_object;
-       uint8_t         bpl_blockshift;
-       uint8_t         bpl_bpshift;
-       uint8_t         bpl_havecomp;
-       bplist_q_t      *bpl_queue;
-       bplist_phys_t   *bpl_phys;
-       dmu_buf_t       *bpl_dbuf;
-       dmu_buf_t       *bpl_cached_dbuf;
+       list_t          bpl_list;
 } bplist_t;
 
-extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx);
-extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx);
-extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object);
-extern void bplist_close(bplist_t *bpl);
-extern boolean_t bplist_empty(bplist_t *bpl);
-extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp);
-extern int bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx);
-extern void bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp);
-extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx);
-extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx);
-extern int bplist_space(bplist_t *bpl,
-    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
-extern int bplist_space_birthrange(bplist_t *bpl,
-    uint64_t mintxg, uint64_t maxtxg, uint64_t *dasizep);
+typedef int bplist_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+
+void bplist_create(bplist_t *bpl);
+void bplist_destroy(bplist_t *bpl);
+void bplist_append(bplist_t *bpl, const blkptr_t *bp);
+void bplist_iterate(bplist_t *bpl, bplist_itor_t *func,
+    void *arg, dmu_tx_t *tx);
 
 #ifdef __cplusplus
 }
diff --git a/module/zfs/include/sys/bpobj.h b/module/zfs/include/sys/bpobj.h
new file mode 100644 (file)
index 0000000..3771a95
--- /dev/null
@@ -0,0 +1,91 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef        _SYS_BPOBJ_H
+#define        _SYS_BPOBJ_H
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct bpobj_phys {
+       /*
+        * This is the bonus buffer for the dead lists.  The object's
+        * contents is an array of bpo_entries blkptr_t's, representing
+        * a total of bpo_bytes physical space.
+        */
+       uint64_t        bpo_num_blkptrs;
+       uint64_t        bpo_bytes;
+       uint64_t        bpo_comp;
+       uint64_t        bpo_uncomp;
+       uint64_t        bpo_subobjs;
+       uint64_t        bpo_num_subobjs;
+} bpobj_phys_t;
+
+#define        BPOBJ_SIZE_V0   (2 * sizeof (uint64_t))
+#define        BPOBJ_SIZE_V1   (4 * sizeof (uint64_t))
+
+typedef struct bpobj {
+       kmutex_t        bpo_lock;
+       objset_t        *bpo_os;
+       uint64_t        bpo_object;
+       int             bpo_epb;
+       uint8_t         bpo_havecomp;
+       uint8_t         bpo_havesubobj;
+       bpobj_phys_t    *bpo_phys;
+       dmu_buf_t       *bpo_dbuf;
+       dmu_buf_t       *bpo_cached_dbuf;
+} bpobj_t;
+
+typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+
+uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx);
+void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
+
+int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object);
+void bpobj_close(bpobj_t *bpo);
+
+int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx);
+int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *);
+int bpobj_iterate_dbg(bpobj_t *bpo, uint64_t *itorp, blkptr_t *bp);
+
+void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
+void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx);
+
+int bpobj_space(bpobj_t *bpo,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BPOBJ_H */
index 2678525..4c05806 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_DBUF_H
@@ -38,7 +37,6 @@
 extern "C" {
 #endif
 
-#define        DB_BONUS_BLKID (-1ULL)
 #define        IN_DMU_SYNC 2
 
 /*
@@ -75,7 +73,6 @@ typedef enum dbuf_states {
        DB_EVICTING
 } dbuf_states_t;
 
-struct objset_impl;
 struct dnode;
 struct dmu_tx;
 
@@ -134,6 +131,7 @@ typedef struct dbuf_dirty_record {
                        arc_buf_t *dr_data;
                        blkptr_t dr_overridden_by;
                        override_states_t dr_override_state;
+                       uint8_t dr_copies;
                } dl;
        } dt;
 } dbuf_dirty_record_t;
@@ -148,7 +146,7 @@ typedef struct dmu_buf_impl {
        dmu_buf_t db;
 
        /* the objset we belong to */
-       struct objset_impl *db_objset;
+       struct objset *db_objset;
 
        /*
         * the dnode we belong to (NULL when evicted)
@@ -242,6 +240,10 @@ uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
 
 dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
 void dbuf_create_bonus(struct dnode *dn);
+int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx);
+void dbuf_spill_hold(struct dnode *dn, dmu_buf_impl_t **dbp, void *tag);
+
+void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);
 
 dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
 dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
@@ -255,6 +257,7 @@ void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
 uint64_t dbuf_refcount(dmu_buf_impl_t *db);
 
 void dbuf_rele(dmu_buf_impl_t *db, void *tag);
+void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);
 
 dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid);
 
@@ -266,6 +269,7 @@ void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
 void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
 dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
 
 void dbuf_clear(dmu_buf_impl_t *db);
 void dbuf_evict(dmu_buf_impl_t *db);
@@ -273,6 +277,7 @@ void dbuf_evict(dmu_buf_impl_t *db);
 void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 void dbuf_unoverride(dbuf_dirty_record_t *dr);
 void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
+void dbuf_release_bp(dmu_buf_impl_t *db);
 
 void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
     struct dmu_tx *);
@@ -324,7 +329,7 @@ _NOTE(CONSTCOND) } while (0)
 #define        dprintf_dbuf_bp(db, bp, fmt, ...) do {                  \
        if (zfs_flags & ZFS_DEBUG_DPRINTF) {                    \
        char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);  \
-       sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp);           \
+       sprintf_blkptr(__blkbuf, bp);                           \
        dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf);   \
        kmem_free(__blkbuf, BP_SPRINTF_LEN);                    \
        }                                                       \
diff --git a/module/zfs/include/sys/ddt.h b/module/zfs/include/sys/ddt.h
new file mode 100644 (file)
index 0000000..9724d6e
--- /dev/null
@@ -0,0 +1,246 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_DDT_H
+#define        _SYS_DDT_H
+
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * On-disk DDT formats, in the desired search order (newest version first).
+ */
+enum ddt_type {
+       DDT_TYPE_ZAP = 0,
+       DDT_TYPES
+};
+
+/*
+ * DDT classes, in the desired search order (highest replication level first).
+ */
+enum ddt_class {
+       DDT_CLASS_DITTO = 0,
+       DDT_CLASS_DUPLICATE,
+       DDT_CLASS_UNIQUE,
+       DDT_CLASSES
+};
+
+#define        DDT_TYPE_CURRENT                0
+
+#define        DDT_COMPRESS_BYTEORDER_MASK     0x80
+#define        DDT_COMPRESS_FUNCTION_MASK      0x7f
+
+/*
+ * On-disk ddt entry:  key (name) and physical storage (value).
+ */
+typedef struct ddt_key {
+       zio_cksum_t     ddk_cksum;      /* 256-bit block checksum */
+       uint64_t        ddk_prop;       /* LSIZE, PSIZE, compression */
+} ddt_key_t;
+
+/*
+ * ddk_prop layout:
+ *
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ *     |   0   |   0   |   0   | comp  |     PSIZE     |     LSIZE     |
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ */
+#define        DDK_GET_LSIZE(ddk)      \
+       BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
+#define        DDK_SET_LSIZE(ddk, x)   \
+       BF64_SET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define        DDK_GET_PSIZE(ddk)      \
+       BF64_GET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
+#define        DDK_SET_PSIZE(ddk, x)   \
+       BF64_SET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define        DDK_GET_COMPRESS(ddk)           BF64_GET((ddk)->ddk_prop, 32, 8)
+#define        DDK_SET_COMPRESS(ddk, x)        BF64_SET((ddk)->ddk_prop, 32, 8, x)
+
+#define        DDT_KEY_WORDS   (sizeof (ddt_key_t) / sizeof (uint64_t))
+
+typedef struct ddt_phys {
+       dva_t           ddp_dva[SPA_DVAS_PER_BP];
+       uint64_t        ddp_refcnt;
+       uint64_t        ddp_phys_birth;
+} ddt_phys_t;
+
+enum ddt_phys_type {
+       DDT_PHYS_DITTO = 0,
+       DDT_PHYS_SINGLE = 1,
+       DDT_PHYS_DOUBLE = 2,
+       DDT_PHYS_TRIPLE = 3,
+       DDT_PHYS_TYPES
+};
+
+/*
+ * In-core ddt entry
+ */
+struct ddt_entry {
+       ddt_key_t       dde_key;
+       ddt_phys_t      dde_phys[DDT_PHYS_TYPES];
+       zio_t           *dde_lead_zio[DDT_PHYS_TYPES];
+       void            *dde_repair_data;
+       enum ddt_type   dde_type;
+       enum ddt_class  dde_class;
+       uint8_t         dde_loading;
+       uint8_t         dde_loaded;
+       kcondvar_t      dde_cv;
+       avl_node_t      dde_node;
+};
+
+/*
+ * In-core ddt
+ */
+struct ddt {
+       kmutex_t        ddt_lock;
+       avl_tree_t      ddt_tree;
+       avl_tree_t      ddt_repair_tree;
+       enum zio_checksum ddt_checksum;
+       spa_t           *ddt_spa;
+       objset_t        *ddt_os;
+       uint64_t        ddt_stat_object;
+       uint64_t        ddt_object[DDT_TYPES][DDT_CLASSES];
+       ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES];
+       ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES];
+       ddt_object_t    ddt_object_stats[DDT_TYPES][DDT_CLASSES];
+       avl_node_t      ddt_node;
+};
+
+/*
+ * In-core and on-disk bookmark for DDT walks
+ */
+typedef struct ddt_bookmark {
+       uint64_t        ddb_class;
+       uint64_t        ddb_type;
+       uint64_t        ddb_checksum;
+       uint64_t        ddb_cursor;
+} ddt_bookmark_t;
+
+/*
+ * Ops vector to access a specific DDT object type.
+ */
+typedef struct ddt_ops {
+       char ddt_op_name[32];
+       int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx,
+           boolean_t prehash);
+       int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
+       int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde);
+       void (*ddt_op_prefetch)(objset_t *os, uint64_t object,
+           ddt_entry_t *dde);
+       int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+           dmu_tx_t *tx);
+       int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+           dmu_tx_t *tx);
+       int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+           uint64_t *walk);
+       uint64_t (*ddt_op_count)(objset_t *os, uint64_t object);
+} ddt_ops_t;
+
+#define        DDT_NAMELEN     80
+
+extern void ddt_object_name(ddt_t *ddt, enum ddt_type type,
+    enum ddt_class class, char *name);
+extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type,
+    enum ddt_class class, uint64_t *walk, ddt_entry_t *dde);
+extern uint64_t ddt_object_count(ddt_t *ddt, enum ddt_type type,
+    enum ddt_class class);
+extern int ddt_object_info(ddt_t *ddt, enum ddt_type type,
+    enum ddt_class class, dmu_object_info_t *);
+extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type,
+    enum ddt_class class);
+
+extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp,
+    uint64_t txg);
+extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
+    const ddt_phys_t *ddp, blkptr_t *bp);
+
+extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);
+
+extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp);
+extern void ddt_phys_clear(ddt_phys_t *ddp);
+extern void ddt_phys_addref(ddt_phys_t *ddp);
+extern void ddt_phys_decref(ddt_phys_t *ddp);
+extern void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp,
+    uint64_t txg);
+extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp);
+extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde);
+
+extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg);
+
+extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
+extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);
+extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh);
+extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo);
+extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh);
+extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total);
+
+extern uint64_t ddt_get_dedup_dspace(spa_t *spa);
+extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa);
+
+extern int ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde,
+    ddt_phys_t *ddp_willref);
+extern int ddt_ditto_copies_present(ddt_entry_t *dde);
+
+extern size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len);
+extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len);
+
+extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp);
+extern void ddt_enter(ddt_t *ddt);
+extern void ddt_exit(ddt_t *ddt);
+extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add);
+extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp);
+extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde);
+
+extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class,
+    const blkptr_t *bp);
+
+extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp);
+extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde);
+
+extern int ddt_entry_compare(const void *x1, const void *x2);
+
+extern void ddt_create(spa_t *spa);
+extern int ddt_load(spa_t *spa);
+extern void ddt_unload(spa_t *spa);
+extern void ddt_sync(spa_t *spa, uint64_t txg);
+extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde);
+extern int ddt_object_update(ddt_t *ddt, enum ddt_type type,
+    enum ddt_class class, ddt_entry_t *dde, dmu_tx_t *tx);
+
+extern const ddt_ops_t ddt_zap_ops;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DDT_H */
index 3ff71b3..83932f4 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
+/* Portions Copyright 2010 Robert Milkowski */
+
 #ifndef        _SYS_DMU_H
 #define        _SYS_DMU_H
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/cred.h>
+#include <sys/time.h>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 struct uio;
+struct xuio;
 struct page;
 struct vnode;
 struct spa;
@@ -59,8 +62,9 @@ struct drr_end;
 struct zbookmark;
 struct spa;
 struct nvlist;
-struct objset_impl;
 struct arc_buf;
+struct zio_prop;
+struct sa_handle;
 
 typedef struct objset objset_t;
 typedef struct dmu_tx dmu_tx_t;
@@ -73,8 +77,8 @@ typedef enum dmu_object_type {
        DMU_OT_OBJECT_ARRAY,            /* UINT64 */
        DMU_OT_PACKED_NVLIST,           /* UINT8 (XDR by nvlist_pack/unpack) */
        DMU_OT_PACKED_NVLIST_SIZE,      /* UINT64 */
-       DMU_OT_BPLIST,                  /* UINT64 */
-       DMU_OT_BPLIST_HDR,              /* UINT64 */
+       DMU_OT_BPOBJ,                   /* UINT64 */
+       DMU_OT_BPOBJ_HDR,               /* UINT64 */
        /* spa: */
        DMU_OT_SPACE_MAP_HEADER,        /* UINT64 */
        DMU_OT_SPACE_MAP,               /* UINT64 */
@@ -114,10 +118,22 @@ typedef enum dmu_object_type {
        DMU_OT_FUID,                    /* FUID table (Packed NVLIST UINT8) */
        DMU_OT_FUID_SIZE,               /* FUID table size UINT64 */
        DMU_OT_NEXT_CLONES,             /* ZAP */
-       DMU_OT_SCRUB_QUEUE,             /* ZAP */
+       DMU_OT_SCAN_QUEUE,              /* ZAP */
        DMU_OT_USERGROUP_USED,          /* ZAP */
        DMU_OT_USERGROUP_QUOTA,         /* ZAP */
        DMU_OT_USERREFS,                /* ZAP */
+       DMU_OT_DDT_ZAP,                 /* ZAP */
+       DMU_OT_DDT_STATS,               /* ZAP */
+       DMU_OT_SA,                      /* System attr */
+       DMU_OT_SA_MASTER_NODE,          /* ZAP */
+       DMU_OT_SA_ATTR_REGISTRATION,    /* ZAP */
+       DMU_OT_SA_ATTR_LAYOUTS,         /* ZAP */
+       DMU_OT_SCAN_XLATE,              /* ZAP */
+       DMU_OT_DEDUP,                   /* fake dedup BP from ddt_bp_create() */
+       DMU_OT_DEADLIST,                /* ZAP */
+       DMU_OT_DEADLIST_HDR,            /* UINT64 */
+       DMU_OT_DSL_CLONES,              /* ZAP */
+       DMU_OT_BPOBJ_SUBOBJ,            /* UINT64 */
        DMU_OT_NUMTYPES
 } dmu_object_type_t;
 
@@ -140,16 +156,6 @@ void zfs_oldacl_byteswap(void *buf, size_t size);
 void zfs_acl_byteswap(void *buf, size_t size);
 void zfs_znode_byteswap(void *buf, size_t size);
 
-#define        DS_MODE_NOHOLD          0       /* internal use only */
-#define        DS_MODE_USER            1       /* simple access, no special needs */
-#define        DS_MODE_OWNER           2       /* the "main" access, e.g. a mount */
-#define        DS_MODE_TYPE_MASK       0x3
-#define        DS_MODE_TYPE(x)         ((x) & DS_MODE_TYPE_MASK)
-#define        DS_MODE_READONLY        0x8
-#define        DS_MODE_IS_READONLY(x)  ((x) & DS_MODE_READONLY)
-#define        DS_MODE_INCONSISTENT    0x10
-#define        DS_MODE_IS_INCONSISTENT(x)      ((x) & DS_MODE_INCONSISTENT)
-
 #define        DS_FIND_SNAPSHOTS       (1<<0)
 #define        DS_FIND_CHILDREN        (1<<1)
 
@@ -162,27 +168,35 @@ void zfs_znode_byteswap(void *buf, size_t size);
 
 #define        DMU_USERUSED_OBJECT     (-1ULL)
 #define        DMU_GROUPUSED_OBJECT    (-2ULL)
+#define        DMU_DEADLIST_OBJECT     (-3ULL)
 
 /*
+ * artificial blkids for bonus buffer and spill blocks
+ */
+#define        DMU_BONUS_BLKID         (-1ULL)
+#define        DMU_SPILL_BLKID         (-2ULL)
+/*
  * Public routines to create, destroy, open, and close objsets.
  */
-int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
-    objset_t **osp);
-int dmu_objset_open_ds(struct dsl_dataset *ds, dmu_objset_type_t type,
-    objset_t **osp);
-void dmu_objset_close(objset_t *os);
+int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
+int dmu_objset_own(const char *name, dmu_objset_type_t type,
+    boolean_t readonly, void *tag, objset_t **osp);
+void dmu_objset_rele(objset_t *os, void *tag);
+void dmu_objset_disown(objset_t *os, void *tag);
+int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp);
+
 int dmu_objset_evict_dbufs(objset_t *os);
-int dmu_objset_create(const char *name, dmu_objset_type_t type,
-    objset_t *clone_parent, uint64_t flags,
+int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
     void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
+int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin,
+    uint64_t flags);
 int dmu_objset_destroy(const char *name, boolean_t defer);
 int dmu_snapshots_destroy(char *fsname, char *snapname, boolean_t defer);
-int dmu_objset_rollback(objset_t *os);
 int dmu_objset_snapshot(char *fsname, char *snapname, struct nvlist *props,
     boolean_t recursive);
 int dmu_objset_rename(const char *name, const char *newname,
     boolean_t recursive);
-int dmu_objset_find(char *name, int func(char *, void *), void *arg,
+int dmu_objset_find(char *name, int func(const char *, void *), void *arg,
     int flags);
 void dmu_objset_byteswap(void *buf, size_t size);
 
@@ -201,7 +215,7 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
 #define        DMU_POOL_DIRECTORY_OBJECT       1
 #define        DMU_POOL_CONFIG                 "config"
 #define        DMU_POOL_ROOT_DATASET           "root_dataset"
-#define        DMU_POOL_SYNC_BPLIST            "sync_bplist"
+#define        DMU_POOL_SYNC_BPOBJ             "sync_bplist"
 #define        DMU_POOL_ERRLOG_SCRUB           "errlog_scrub"
 #define        DMU_POOL_ERRLOG_LAST            "errlog_last"
 #define        DMU_POOL_SPARES                 "spares"
@@ -209,19 +223,12 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
 #define        DMU_POOL_HISTORY                "history"
 #define        DMU_POOL_PROPS                  "pool_props"
 #define        DMU_POOL_L2CACHE                "l2cache"
-
-/* 4x8 zbookmark_t */
-#define        DMU_POOL_SCRUB_BOOKMARK         "scrub_bookmark"
-/* 1x8 zap obj DMU_OT_SCRUB_QUEUE */
-#define        DMU_POOL_SCRUB_QUEUE            "scrub_queue"
-/* 1x8 txg */
-#define        DMU_POOL_SCRUB_MIN_TXG          "scrub_min_txg"
-/* 1x8 txg */
-#define        DMU_POOL_SCRUB_MAX_TXG          "scrub_max_txg"
-/* 1x4 enum scrub_func */
-#define        DMU_POOL_SCRUB_FUNC             "scrub_func"
-/* 1x8 count */
-#define        DMU_POOL_SCRUB_ERRORS           "scrub_errors"
+#define        DMU_POOL_TMP_USERREFS           "tmp_userrefs"
+#define        DMU_POOL_DDT                    "DDT-%s-%s-%s"
+#define        DMU_POOL_DDT_STATS              "DDT-statistics"
+#define        DMU_POOL_CREATION_VERSION       "creation_version"
+#define        DMU_POOL_SCAN                   "scan"
+#define        DMU_POOL_FREE_BPOBJ             "free_bpobj"
 
 /*
  * Allocate an object from this objset.  The range of object numbers
@@ -306,11 +313,14 @@ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
     dmu_tx_t *tx);
 
 /*
- * Decide how many copies of a given block we should make.  Can be from
- * 1 to SPA_DVAS_PER_BP.
+ * Decide how to write a block: checksum, compression, number of copies, etc.
  */
-int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb,
-    dmu_object_type_t ot);
+#define        WP_NOFILL       0x1
+#define        WP_DMU_SYNC     0x2
+#define        WP_SPILL        0x4
+
+void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
+    struct zio_prop *zp);
 /*
  * The bonus data is accessed more or less like a regular buffer.
  * You must dmu_bonus_hold() to get the buffer, which will give you a
@@ -324,6 +334,17 @@ int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb,
 int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
 int dmu_bonus_max(void);
 int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
+int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
+int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
+
+/*
+ * Special spill buffer support used by "SA" framework
+ */
+
+int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
+int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags,
+    void *tag, dmu_buf_t **dbp);
+int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
 
 /*
  * Obtain the DMU buffer from the specified object which contains the
@@ -340,7 +361,7 @@ int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
  * The object number must be a valid, allocated object number.
  */
 int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
-    void *tag, dmu_buf_t **);
+    void *tag, dmu_buf_t **, int flags);
 void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
 void dmu_buf_rele(dmu_buf_t *db, void *tag);
 uint64_t dmu_buf_refcount(dmu_buf_t *db);
@@ -437,12 +458,35 @@ void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
     uint64_t len);
 void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
 void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
+void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
 void dmu_tx_abort(dmu_tx_t *tx);
 int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
 void dmu_tx_wait(dmu_tx_t *tx);
 void dmu_tx_commit(dmu_tx_t *tx);
 
 /*
+ * To register a commit callback, dmu_tx_callback_register() must be called.
+ *
+ * dcb_data is a pointer to caller private data that is passed on as a
+ * callback parameter. The caller is responsible for properly allocating and
+ * freeing it.
+ *
+ * When registering a callback, the transaction must be already created, but
+ * it cannot be committed or aborted. It can be assigned to a txg or not.
+ *
+ * The callback will be called after the transaction has been safely written
+ * to stable storage and will also be called if the dmu_tx is aborted.
+ * If there is any error which prevents the transaction from being committed to
+ * disk, the callback will be called with a value of error != 0.
+ */
+typedef void dmu_tx_callback_func_t(void *dcb_data, int error);
+
+void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
+    void *dcb_data);
+
+/*
  * Free up the data blocks for a defined range of a file.  If size is
  * zero, the range from offset to end-of-file is freed.
  */
@@ -469,12 +513,23 @@ void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
 int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
     dmu_tx_t *tx);
+int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
+    dmu_tx_t *tx);
 int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, struct page *pp, dmu_tx_t *tx);
 struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
 void dmu_return_arcbuf(struct arc_buf *buf);
 void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
     dmu_tx_t *tx);
+int dmu_xuio_init(struct xuio *uio, int niov);
+void dmu_xuio_fini(struct xuio *uio);
+int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off,
+    size_t n);
+int dmu_xuio_cnt(struct xuio *uio);
+struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i);
+void dmu_xuio_clear(struct xuio *uio, int i);
+void xuio_stat_wbuf_copied();
+void xuio_stat_wbuf_nocopy();
 
 extern int zfs_prefetch_disable;
 
@@ -485,19 +540,19 @@ void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t len);
 
 typedef struct dmu_object_info {
-       /* All sizes are in bytes. */
+       /* All sizes are in bytes unless otherwise indicated. */
        uint32_t doi_data_block_size;
        uint32_t doi_metadata_block_size;
-       uint64_t doi_bonus_size;
        dmu_object_type_t doi_type;
        dmu_object_type_t doi_bonus_type;
+       uint64_t doi_bonus_size;
        uint8_t doi_indirection;                /* 2 = dnode->indirect->data */
        uint8_t doi_checksum;
        uint8_t doi_compress;
        uint8_t doi_pad[5];
-       /* Values below are number of 512-byte blocks. */
-       uint64_t doi_physical_blks;             /* data + metadata */
-       uint64_t doi_max_block_offset;
+       uint64_t doi_physical_blocks_512;       /* data + metadata, 512b blks */
+       uint64_t doi_max_offset;
+       uint64_t doi_fill_count;                /* number of non-empty blocks */
 } dmu_object_info_t;
 
 typedef void arc_byteswap_func_t(void *buf, size_t size);
@@ -566,6 +621,11 @@ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
  */
 uint64_t dmu_objset_fsid_guid(objset_t *os);
 
+/*
+ * Get the [cm]time for an objset's snapshot dir
+ */
+timestruc_t dmu_objset_snap_cmtime(objset_t *os);
+
 int dmu_objset_is_snapshot(objset_t *os);
 
 extern struct spa *dmu_objset_spa(objset_t *os);
@@ -575,6 +635,8 @@ extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
 extern void dmu_objset_name(objset_t *os, char *buf);
 extern dmu_objset_type_t dmu_objset_type(objset_t *os);
 extern uint64_t dmu_objset_id(objset_t *os);
+extern uint64_t dmu_objset_syncprop(objset_t *os);
+extern uint64_t dmu_objset_logbias(objset_t *os);
 extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
     uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
 extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
@@ -582,9 +644,8 @@ extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
 extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp);
 
-typedef void objset_used_cb_t(objset_t *os, dmu_object_type_t bonustype,
-    void *oldbonus, void *newbonus, uint64_t oldused, uint64_t newused,
-    dmu_tx_t *tx);
+typedef int objset_used_cb_t(dmu_object_type_t bonustype,
+    void *bonus, uint64_t *userp, uint64_t *groupp);
 extern void dmu_objset_register_type(dmu_objset_type_t ost,
     objset_used_cb_t *cb);
 extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
@@ -605,9 +666,20 @@ uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
  * storage when the write completes this new data does not become a
  * permanent part of the file until the associated transaction commits.
  */
-typedef void dmu_sync_cb_t(dmu_buf_t *db, void *arg);
-int dmu_sync(struct zio *zio, dmu_buf_t *db,
-    struct blkptr *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg);
+
+/*
+ * {zfs,zvol,ztest}_get_done() args
+ */
+typedef struct zgd {
+       struct zilog    *zgd_zilog;
+       struct blkptr   *zgd_bp;
+       dmu_buf_t       *zgd_db;
+       struct rl       *zgd_rl;
+       void            *zgd_private;
+} zgd_t;
+
+typedef void dmu_sync_cb_t(zgd_t *arg, int error);
+int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd);
 
 /*
  * Find the next hole or data block in file starting at *off
@@ -642,11 +714,12 @@ typedef struct dmu_recv_cookie {
        struct dsl_dataset *drc_real_ds;
        struct drr_begin *drc_drrb;
        char *drc_tosnap;
+       char *drc_top_ds;
        boolean_t drc_newfs;
        boolean_t drc_force;
 } dmu_recv_cookie_t;
 
-int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *,
+int dmu_recv_begin(char *tofs, char *tosnap, char *topds, struct drr_begin *,
     boolean_t force, objset_t *origin, dmu_recv_cookie_t *);
 int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp);
 int dmu_recv_end(dmu_recv_cookie_t *drc);
index 3868a58..22f9f5f 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -210,8 +210,7 @@ extern "C" {
  *
  * ds_lock
  *    protects:
- *     ds_user_ptr
- *     ds_user_evict_func
+ *     ds_objset
  *     ds_open_refcount
  *     ds_snapname
  *     ds_phys accounting
@@ -233,6 +232,39 @@ extern "C" {
 struct objset;
 struct dmu_pool;
 
+typedef struct dmu_xuio {
+       int next;
+       int cnt;
+       struct arc_buf **bufs;
+       iovec_t *iovp;
+} dmu_xuio_t;
+
+typedef struct xuio_stats {
+       /* loaned yet not returned arc_buf */
+       kstat_named_t xuiostat_onloan_rbuf;
+       kstat_named_t xuiostat_onloan_wbuf;
+       /* whether a copy is made when loaning out a read buffer */
+       kstat_named_t xuiostat_rbuf_copied;
+       kstat_named_t xuiostat_rbuf_nocopy;
+       /* whether a copy is made when assigning a write buffer */
+       kstat_named_t xuiostat_wbuf_copied;
+       kstat_named_t xuiostat_wbuf_nocopy;
+} xuio_stats_t;
+
+static xuio_stats_t xuio_stats = {
+       { "onloan_read_buf",    KSTAT_DATA_UINT64 },
+       { "onloan_write_buf",   KSTAT_DATA_UINT64 },
+       { "read_buf_copied",    KSTAT_DATA_UINT64 },
+       { "read_buf_nocopy",    KSTAT_DATA_UINT64 },
+       { "write_buf_copied",   KSTAT_DATA_UINT64 },
+       { "write_buf_nocopy",   KSTAT_DATA_UINT64 }
+};
+
+#define        XUIOSTAT_INCR(stat, val)        \
+       atomic_add_64(&xuio_stats.stat.value.ui64, (val))
+#define        XUIOSTAT_BUMP(stat)     XUIOSTAT_INCR(stat, 1)
+
+
 #ifdef __cplusplus
 }
 #endif
index 052cb8d..5c5119a 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
+/* Portions Copyright 2010 Robert Milkowski */
+
 #ifndef        _SYS_DMU_OBJSET_H
 #define        _SYS_DMU_OBJSET_H
 
@@ -33,6 +34,7 @@
 #include <sys/dnode.h>
 #include <sys/zio.h>
 #include <sys/zil.h>
+#include <sys/sa.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -40,11 +42,13 @@ extern "C" {
 
 struct dsl_dataset;
 struct dmu_tx;
-struct objset_impl;
 
 #define        OBJSET_PHYS_SIZE 2048
 #define        OBJSET_OLD_PHYS_SIZE 1024
 
+#define        OBJSET_BUF_HAS_USERUSED(buf) \
+       (arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE)
+
 #define        OBJSET_FLAG_USERACCOUNTING_COMPLETE     (1ULL<<0)
 
 typedef struct objset_phys {
@@ -59,11 +63,6 @@ typedef struct objset_phys {
 } objset_phys_t;
 
 struct objset {
-       struct objset_impl *os;
-       int os_mode;
-};
-
-typedef struct objset_impl {
        /* Immutable: */
        struct dsl_dataset *os_dsl_dataset;
        spa_t *os_spa;
@@ -73,12 +72,17 @@ typedef struct objset_impl {
        dnode_t *os_userused_dnode;
        dnode_t *os_groupused_dnode;
        zilog_t *os_zil;
-       objset_t os;
-       uint8_t os_checksum;    /* can change, under dsl_dir's locks */
-       uint8_t os_compress;    /* can change, under dsl_dir's locks */
-       uint8_t os_copies;      /* can change, under dsl_dir's locks */
-       uint8_t os_primary_cache;       /* can change, under dsl_dir's locks */
-       uint8_t os_secondary_cache;     /* can change, under dsl_dir's locks */
+
+       /* can change, under dsl_dir's locks: */
+       uint8_t os_checksum;
+       uint8_t os_compress;
+       uint8_t os_copies;
+       uint8_t os_dedup_checksum;
+       uint8_t os_dedup_verify;
+       uint8_t os_logbias;
+       uint8_t os_primary_cache;
+       uint8_t os_secondary_cache;
+       uint8_t os_sync;
 
        /* no lock needed: */
        struct dmu_tx *os_synctx; /* XXX sketchy */
@@ -101,8 +105,12 @@ typedef struct objset_impl {
        /* stuff we store for the user */
        kmutex_t os_user_ptr_lock;
        void *os_user_ptr;
-} objset_impl_t;
 
+       /* SA layout/attribute registration */
+       sa_os_t *os_sa;
+};
+
+#define        DMU_META_OBJSET         0
 #define        DMU_META_DNODE_OBJECT   0
 #define        DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0)
 
@@ -111,14 +119,18 @@ typedef struct objset_impl {
        (os)->os_secondary_cache == ZFS_CACHE_METADATA)
 
 /* called from zpl */
-int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
-    objset_t **osp);
-void dmu_objset_close(objset_t *os);
-int dmu_objset_create(const char *name, dmu_objset_type_t type,
-    objset_t *clone_parent, uint64_t flags,
+int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
+int dmu_objset_own(const char *name, dmu_objset_type_t type,
+    boolean_t readonly, void *tag, objset_t **osp);
+void dmu_objset_rele(objset_t *os, void *tag);
+void dmu_objset_disown(objset_t *os, void *tag);
+int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp);
+
+int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
     void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
+int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin,
+    uint64_t flags);
 int dmu_objset_destroy(const char *name, boolean_t defer);
-int dmu_objset_rollback(objset_t *os);
 int dmu_objset_snapshot(char *fsname, char *snapname, nvlist_t *props,
     boolean_t recursive);
 void dmu_objset_stats(objset_t *os, nvlist_t *nv);
@@ -126,23 +138,26 @@ void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
 void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp);
 uint64_t dmu_objset_fsid_guid(objset_t *os);
-int dmu_objset_find(char *name, int func(char *, void *), void *arg,
+int dmu_objset_find(char *name, int func(const char *, void *), void *arg,
     int flags);
 int dmu_objset_find_spa(spa_t *spa, const char *name,
     int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags);
-int dmu_objset_prefetch(char *name, void *arg);
+int dmu_objset_prefetch(const char *name, void *arg);
 void dmu_objset_byteswap(void *buf, size_t size);
 int dmu_objset_evict_dbufs(objset_t *os);
+timestruc_t dmu_objset_snap_cmtime(objset_t *os);
 
 /* called from dsl */
-void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx);
-objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
+void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx);
+boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg);
+objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
     blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx);
 int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
-    objset_impl_t **osip);
-void dmu_objset_evict(struct dsl_dataset *ds, void *arg);
-void dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx);
-boolean_t dmu_objset_userused_enabled(objset_impl_t *os);
+    objset_t **osp);
+void dmu_objset_evict(objset_t *os);
+void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx);
+void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx);
+boolean_t dmu_objset_userused_enabled(objset_t *os);
 int dmu_objset_userspace_upgrade(objset_t *os);
 boolean_t dmu_objset_userspace_present(objset_t *os);
 
index 3e02689..844e7f1 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_DMU_TRAVERSE_H
@@ -36,19 +35,24 @@ extern "C" {
 
 struct dnode_phys;
 struct dsl_dataset;
+struct zilog;
+struct arc_buf;
 
-typedef int (blkptr_cb_t)(spa_t *spa, blkptr_t *bp,
-    const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg);
+typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    struct arc_buf *pbuf, const zbookmark_t *zb, const struct dnode_phys *dnp,
+    void *arg);
 
 #define        TRAVERSE_PRE                    (1<<0)
 #define        TRAVERSE_POST                   (1<<1)
 #define        TRAVERSE_PREFETCH_METADATA      (1<<2)
 #define        TRAVERSE_PREFETCH_DATA          (1<<3)
 #define        TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA)
+#define        TRAVERSE_HARD                   (1<<4)
 
-int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start,
-    int flags, blkptr_cb_t func, void *arg);
-int traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg);
+int traverse_dataset(struct dsl_dataset *ds,
+    uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
+int traverse_pool(spa_t *spa,
+    uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
 
 #ifdef __cplusplus
 }
index 2727daa..c5ea50f 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef        _SYS_DMU_TX_H
 #define        _SYS_DMU_TX_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/inttypes.h>
 #include <sys/dmu.h>
 #include <sys/txg.h>
@@ -59,6 +57,7 @@ struct dmu_tx {
        txg_handle_t tx_txgh;
        void *tx_tempreserve_cookie;
        struct dmu_tx_hold *tx_needassign_txh;
+       list_t tx_callbacks; /* list of dmu_tx_callback_t on this dmu_tx */
        uint8_t tx_anyobj;
        int tx_err;
 #ifdef ZFS_DEBUG
@@ -78,6 +77,7 @@ enum dmu_tx_hold_type {
        THT_FREE,
        THT_ZAP,
        THT_SPACE,
+       THT_SPILL,
        THT_NUMTYPES
 };
 
@@ -98,6 +98,11 @@ typedef struct dmu_tx_hold {
 #endif
 } dmu_tx_hold_t;
 
+typedef struct dmu_tx_callback {
+       list_node_t             dcb_node;    /* linked to tx_callbacks list */
+       dmu_tx_callback_func_t  *dcb_func;   /* caller function pointer */
+       void                    *dcb_data;   /* caller private data */
+} dmu_tx_callback_t;
 
 /*
  * These routines are defined in dmu.h, and are called by the user.
@@ -109,6 +114,10 @@ void dmu_tx_abort(dmu_tx_t *tx);
 uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
 void dmu_tx_wait(dmu_tx_t *tx);
 
+void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
+    void *dcb_data);
+void dmu_tx_do_callbacks(list_t *cb_list, int error);
+
 /*
  * These routines are defined in dmu_spa.h, and are called by the SPA.
  */
index c94bced..78cadd2 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef        _DFETCH_H
 #define        _DFETCH_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/zfs_context.h>
 
 #ifdef __cplusplus
@@ -63,6 +61,9 @@ typedef struct zfetch {
        uint64_t        zf_alloc_fail;  /* # of failed attempts to alloc strm */
 } zfetch_t;
 
+void           zfetch_init(void);
+void           zfetch_fini(void);
+
 void           dmu_zfetch_init(zfetch_t *, struct dnode *);
 void           dmu_zfetch_rele(zfetch_t *);
 void           dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int);
index 48e4da8..8bae160 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_DNODE_H
@@ -63,6 +62,18 @@ extern "C" {
 #define        DN_MAX_OFFSET_SHIFT     64      /* 2^64 bytes in a dnode */
 
 /*
+ * dnode id flags
+ *
+ * Note: a file will never ever have its
+ * ids moved from bonus->spill
+ * and only in a crypto environment would it be on spill
+ */
+#define        DN_ID_CHKED_BONUS       0x1
+#define        DN_ID_CHKED_SPILL       0x2
+#define        DN_ID_OLD_EXIST         0x4
+#define        DN_ID_NEW_EXIST         0x8
+
+/*
  * Derived constants.
  */
 #define        DNODE_SIZE      (1 << DNODE_SHIFT)
@@ -70,10 +81,12 @@ extern "C" {
 #define        DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
 #define        DN_MAX_OBJECT   (1ULL << DN_MAX_OBJECT_SHIFT)
 #define        DN_ZERO_BONUSLEN        (DN_MAX_BONUSLEN + 1)
+#define        DN_KILL_SPILLBLK (1)
 
 #define        DNODES_PER_BLOCK_SHIFT  (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
 #define        DNODES_PER_BLOCK        (1ULL << DNODES_PER_BLOCK_SHIFT)
 #define        DNODES_PER_LEVEL_SHIFT  (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
+#define        DNODES_PER_LEVEL        (1ULL << DNODES_PER_LEVEL_SHIFT)
 
 /* The +2 here is a cheesy way to round up */
 #define        DN_MAX_LEVELS   (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
@@ -88,7 +101,7 @@ extern "C" {
 #define        EPB(blkshift, typeshift)        (1 << (blkshift - typeshift))
 
 struct dmu_buf_impl;
-struct objset_impl;
+struct objset;
 struct zio;
 
 enum dnode_dirtycontext {
@@ -101,6 +114,9 @@ enum dnode_dirtycontext {
 #define        DNODE_FLAG_USED_BYTES           (1<<0)
 #define        DNODE_FLAG_USERUSED_ACCOUNTED   (1<<1)
 
+/* Does dnode have a SA spill blkptr in bonus? */
+#define        DNODE_FLAG_SPILL_BLKPTR (1<<2)
+
 typedef struct dnode_phys {
        uint8_t dn_type;                /* dmu_object_type_t */
        uint8_t dn_indblkshift;         /* ln2(indirect block size) */
@@ -121,7 +137,8 @@ typedef struct dnode_phys {
        uint64_t dn_pad3[4];
 
        blkptr_t dn_blkptr[1];
-       uint8_t dn_bonus[DN_MAX_BONUSLEN];
+       uint8_t dn_bonus[DN_MAX_BONUSLEN - sizeof (blkptr_t)];
+       blkptr_t dn_spill;
 } dnode_phys_t;
 
 typedef struct dnode {
@@ -136,7 +153,7 @@ typedef struct dnode {
        list_node_t dn_link;
 
        /* immutable: */
-       struct objset_impl *dn_objset;
+       struct objset *dn_objset;
        uint64_t dn_object;
        struct dmu_buf_impl *dn_dbuf;
        dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
@@ -161,6 +178,8 @@ typedef struct dnode {
        uint8_t dn_next_nblkptr[TXG_SIZE];
        uint8_t dn_next_nlevels[TXG_SIZE];
        uint8_t dn_next_indblkshift[TXG_SIZE];
+       uint8_t dn_next_bonustype[TXG_SIZE];
+       uint8_t dn_rm_spillblk[TXG_SIZE];       /* for removing spill blk */
        uint16_t dn_next_bonuslen[TXG_SIZE];
        uint32_t dn_next_blksz[TXG_SIZE];       /* next block size in bytes */
 
@@ -185,12 +204,17 @@ typedef struct dnode {
        kmutex_t dn_dbufs_mtx;
        list_t dn_dbufs;                /* linked list of descendent dbuf_t's */
        struct dmu_buf_impl *dn_bonus;  /* bonus buffer dbuf */
+       boolean_t dn_have_spill;        /* have spill or are spilling */
 
        /* parent IO for current sync write */
        zio_t *dn_zio;
 
        /* used in syncing context */
-       dnode_phys_t *dn_oldphys;
+       uint64_t dn_oldused;    /* old phys used bytes */
+       uint64_t dn_oldflags;   /* old phys dn_flags */
+       uint64_t dn_olduid, dn_oldgid;
+       uint64_t dn_newuid, dn_newgid;
+       int dn_id_flags;
 
        /* holds prefetch structure */
        struct zfetch   dn_zfetch;
@@ -202,14 +226,17 @@ typedef struct free_range {
        uint64_t fr_nblks;
 } free_range_t;
 
-dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp,
+dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
     uint64_t object);
 void dnode_special_close(dnode_t *dn);
 
 void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx);
-int dnode_hold(struct objset_impl *dd, uint64_t object,
+void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx);
+void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx);
+
+int dnode_hold(struct objset *dd, uint64_t object,
     void *ref, dnode_t **dnp);
-int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag,
+int dnode_hold_impl(struct objset *dd, uint64_t object, int flag,
     void *ref, dnode_t **dnp);
 boolean_t dnode_add_ref(dnode_t *dn, void *ref);
 void dnode_rele(dnode_t *dn, void *ref);
index b51036d..58414e1 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_DSL_DATASET_H
@@ -33,6 +32,7 @@
 #include <sys/bplist.h>
 #include <sys/dsl_synctask.h>
 #include <sys/zfs_context.h>
+#include <sys/dsl_deadlist.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -42,8 +42,6 @@ struct dsl_dataset;
 struct dsl_dir;
 struct dsl_pool;
 
-typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *);
-
 #define        DS_FLAG_INCONSISTENT    (1ULL<<0)
 #define        DS_IS_INCONSISTENT(ds)  \
        ((ds)->ds_phys->ds_flags & DS_FLAG_INCONSISTENT)
@@ -85,7 +83,7 @@ typedef struct dsl_dataset_phys {
        uint64_t ds_num_children;       /* clone/snap children; ==0 for head */
        uint64_t ds_creation_time;      /* seconds since 1970 */
        uint64_t ds_creation_txg;
-       uint64_t ds_deadlist_obj;       /* DMU_OT_BPLIST */
+       uint64_t ds_deadlist_obj;       /* DMU_OT_DEADLIST */
        uint64_t ds_used_bytes;
        uint64_t ds_compressed_bytes;
        uint64_t ds_uncompressed_bytes;
@@ -115,10 +113,10 @@ typedef struct dsl_dataset {
 
        /* only used in syncing context, only valid for non-snapshots: */
        struct dsl_dataset *ds_prev;
-       uint64_t ds_origin_txg;
 
        /* has internal locking: */
-       bplist_t ds_deadlist;
+       dsl_deadlist_t ds_deadlist;
+       bplist_t ds_pending_deadlist;
 
        /* to protect against multiple concurrent incremental recv */
        kmutex_t ds_recvlock;
@@ -132,8 +130,7 @@ typedef struct dsl_dataset {
         * Protected by ds_lock:
         */
        kmutex_t ds_lock;
-       void *ds_user_ptr;
-       dsl_dataset_evict_func_t *ds_user_evict_func;
+       objset_t *ds_objset;
        uint64_t ds_userrefs;
 
        /*
@@ -165,7 +162,7 @@ struct dsl_ds_destroyarg {
        boolean_t need_prep;            /* do we need to retry due to EBUSY? */
 };
 
-#define        dsl_dataset_is_snapshot(ds)     \
+#define        dsl_dataset_is_snapshot(ds) \
        ((ds)->ds_phys->ds_num_children != 0)
 
 #define        DS_UNIQUE_IS_ACCURATE(ds)       \
@@ -174,17 +171,17 @@ struct dsl_ds_destroyarg {
 int dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp);
 int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj,
     void *tag, dsl_dataset_t **);
-int dsl_dataset_own(const char *name, int flags, void *owner,
-    dsl_dataset_t **dsp);
+int dsl_dataset_own(const char *name, boolean_t inconsistentok,
+    void *tag, dsl_dataset_t **dsp);
 int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj,
-    int flags, void *owner, dsl_dataset_t **);
+    boolean_t inconsistentok, void *tag, dsl_dataset_t **dsp);
 void dsl_dataset_name(dsl_dataset_t *ds, char *name);
 void dsl_dataset_rele(dsl_dataset_t *ds, void *tag);
-void dsl_dataset_disown(dsl_dataset_t *ds, void *owner);
+void dsl_dataset_disown(dsl_dataset_t *ds, void *tag);
 void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag);
 boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok,
-    void *owner);
-void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner);
+    void *tag);
+void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *tag);
 uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname,
     dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *);
 uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
@@ -195,21 +192,18 @@ dsl_checkfunc_t dsl_dataset_destroy_check;
 dsl_syncfunc_t dsl_dataset_destroy_sync;
 dsl_checkfunc_t dsl_dataset_snapshot_check;
 dsl_syncfunc_t dsl_dataset_snapshot_sync;
-int dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost);
 int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive);
-int dsl_dataset_promote(const char *name);
+int dsl_dataset_promote(const char *name, char *conflsnap);
 int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
     boolean_t force);
 int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
-    boolean_t recursive);
+    boolean_t recursive, boolean_t temphold);
 int dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
     boolean_t recursive);
+int dsl_dataset_user_release_tmp(struct dsl_pool *dp, uint64_t dsobj,
+    char *htag);
 int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp);
 
-void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
-    void *p, dsl_dataset_evict_func_t func);
-void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds);
-
 blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
 void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
 
@@ -219,10 +213,12 @@ boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds);
 
 void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
 
-void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
-int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
+void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp,
     dmu_tx_t *tx);
-boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
+int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp,
+    dmu_tx_t *tx, boolean_t async);
+boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
+    uint64_t blk_birth);
 uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
 
 void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
@@ -238,13 +234,13 @@ int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
 int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
     uint64_t asize, uint64_t inflight, uint64_t *used,
     uint64_t *ref_rsrv);
-int dsl_dataset_set_quota(const char *dsname, uint64_t quota);
-void dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr,
-    dmu_tx_t *tx);
-int dsl_dataset_set_reservation(const char *dsname, uint64_t reservation);
-void dsl_dataset_set_flags(dsl_dataset_t *ds, uint64_t flags);
-int64_t dsl_dataset_new_refreservation(dsl_dataset_t *ds, uint64_t reservation,
-    dmu_tx_t *tx);
+int dsl_dataset_set_quota(const char *dsname, zprop_source_t source,
+    uint64_t quota);
+dsl_syncfunc_t dsl_dataset_set_quota_sync;
+int dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
+    uint64_t reservation);
+
+int dsl_destroy_inconsistent(const char *dsname, void *arg);
 
 #ifdef ZFS_DEBUG
 #define        dprintf_ds(ds, fmt, ...) do { \
diff --git a/module/zfs/include/sys/dsl_deadlist.h b/module/zfs/include/sys/dsl_deadlist.h
new file mode 100644 (file)
index 0000000..d2c16d7
--- /dev/null
@@ -0,0 +1,87 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef        _SYS_DSL_DEADLIST_H
+#define        _SYS_DSL_DEADLIST_H
+
+#include <sys/bpobj.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dmu_buf;
+struct dsl_dataset;
+
+typedef struct dsl_deadlist_phys {
+       uint64_t dl_used;
+       uint64_t dl_comp;
+       uint64_t dl_uncomp;
+       uint64_t dl_pad[37]; /* pad out to 320b for future expansion */
+} dsl_deadlist_phys_t;
+
+typedef struct dsl_deadlist {
+       objset_t *dl_os;
+       uint64_t dl_object;
+       avl_tree_t dl_tree;
+       boolean_t dl_havetree;
+       struct dmu_buf *dl_dbuf;
+       dsl_deadlist_phys_t *dl_phys;
+       kmutex_t dl_lock;
+
+       /* if it's the old on-disk format: */
+       bpobj_t dl_bpobj;
+       boolean_t dl_oldfmt;
+} dsl_deadlist_t;
+
+typedef struct dsl_deadlist_entry {
+       avl_node_t dle_node;
+       uint64_t dle_mintxg;
+       bpobj_t dle_bpobj;
+} dsl_deadlist_entry_t;
+
+void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object);
+void dsl_deadlist_close(dsl_deadlist_t *dl);
+uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx);
+void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx);
+void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx);
+void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
+void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
+uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
+    uint64_t mrs_obj, dmu_tx_t *tx);
+void dsl_deadlist_space(dsl_deadlist_t *dl,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+void dsl_deadlist_space_range(dsl_deadlist_t *dl,
+    uint64_t mintxg, uint64_t maxtxg,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx);
+void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
+    dmu_tx_t *tx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DEADLIST_H */
index 56d0638..2191635 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_DSL_DIR_H
@@ -70,7 +69,8 @@ typedef struct dsl_dir_phys {
        uint64_t dd_deleg_zapobj; /* dataset delegation permissions */
        uint64_t dd_flags;
        uint64_t dd_used_breakdown[DD_USED_NUM];
-       uint64_t dd_pad[14]; /* pad out to 256 bytes for good measure */
+       uint64_t dd_clones; /* dsl_dir objects */
+       uint64_t dd_pad[13]; /* pad out to 256 bytes for good measure */
 } dsl_dir_phys_t;
 
 struct dsl_dir {
@@ -89,6 +89,8 @@ struct dsl_dir {
        /* Protected by dd_lock */
        kmutex_t dd_lock;
        list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */
+       timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */
+       uint64_t dd_origin_txg;
 
        /* gross estimate of space used by in-flight tx's */
        uint64_t dd_tempreserved[TXG_SIZE];
@@ -125,18 +127,24 @@ void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
     int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx);
 void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
     dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx);
-int dsl_dir_set_quota(const char *ddname, uint64_t quota);
-int dsl_dir_set_reservation(const char *ddname, uint64_t reservation);
+int dsl_dir_set_quota(const char *ddname, zprop_source_t source,
+    uint64_t quota);
+int dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
+    uint64_t reservation);
 int dsl_dir_rename(dsl_dir_t *dd, const char *newname);
 int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space);
 int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx);
 boolean_t dsl_dir_is_clone(dsl_dir_t *dd);
 void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds,
     uint64_t reservation, cred_t *cr, dmu_tx_t *tx);
+void dsl_dir_snap_cmtime_update(dsl_dir_t *dd);
+timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd);
 
 /* internal reserved dir name */
 #define        MOS_DIR_NAME "$MOS"
 #define        ORIGIN_DIR_NAME "$ORIGIN"
+#define        XLATION_DIR_NAME "$XLATION"
+#define        FREE_DIR_NAME "$FREE"
 
 #ifdef ZFS_DEBUG
 #define        dprintf_dd(dd, fmt, ...) do { \
index d8da295..7d25bd7 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_DSL_POOL_H
@@ -32,6 +31,9 @@
 #include <sys/zfs_context.h>
 #include <sys/zio.h>
 #include <sys/dnode.h>
+#include <sys/ddt.h>
+#include <sys/arc.h>
+#include <sys/bpobj.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -42,12 +44,7 @@ struct dsl_dir;
 struct dsl_dataset;
 struct dsl_pool;
 struct dmu_tx;
-
-enum scrub_func {
-       SCRUB_FUNC_NONE,
-       SCRUB_FUNC_CLEAN,
-       SCRUB_FUNC_NUMFUNCS
-};
+struct dsl_scan;
 
 /* These macros are for indexing into the zfs_all_blkstats_t. */
 #define        DMU_OT_DEFERRED DMU_OT_NONE
@@ -75,6 +72,7 @@ typedef struct dsl_pool {
        struct objset *dp_meta_objset;
        struct dsl_dir *dp_root_dir;
        struct dsl_dir *dp_mos_dir;
+       struct dsl_dir *dp_free_dir;
        struct dsl_dataset *dp_origin_snap;
        uint64_t dp_root_dir_obj;
        struct taskq *dp_vnrele_taskq;
@@ -83,25 +81,18 @@ typedef struct dsl_pool {
        blkptr_t dp_meta_rootbp;
        list_t dp_synced_datasets;
        hrtime_t dp_read_overhead;
-       uint64_t dp_throughput;
+       uint64_t dp_throughput; /* bytes per millisec */
        uint64_t dp_write_limit;
+       uint64_t dp_tmp_userrefs_obj;
+       bpobj_t dp_free_bpobj;
+
+       struct dsl_scan *dp_scan;
 
        /* Uses dp_lock */
        kmutex_t dp_lock;
        uint64_t dp_space_towrite[TXG_SIZE];
        uint64_t dp_tempreserved[TXG_SIZE];
 
-       enum scrub_func dp_scrub_func;
-       uint64_t dp_scrub_queue_obj;
-       uint64_t dp_scrub_min_txg;
-       uint64_t dp_scrub_max_txg;
-       zbookmark_t dp_scrub_bookmark;
-       boolean_t dp_scrub_pausing;
-       boolean_t dp_scrub_isresilver;
-       uint64_t dp_scrub_start_time;
-       kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */
-       boolean_t dp_scrub_restart;
-
        /* Has its own locking */
        tx_state_t dp_tx;
        txg_list_t dp_dirty_datasets;
@@ -123,29 +114,36 @@ int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp);
 void dsl_pool_close(dsl_pool_t *dp);
 dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg);
 void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
-void dsl_pool_zil_clean(dsl_pool_t *dp);
+void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg);
 int dsl_pool_sync_context(dsl_pool_t *dp);
 uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
+uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree);
 int dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx);
 void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
 void dsl_pool_memory_pressure(dsl_pool_t *dp);
 void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
-int dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp,
-    zio_done_func_t *done, void *private, uint32_t arc_flags);
-void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
-void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
-void dsl_pool_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
-    struct dmu_tx *tx);
+void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
+void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg,
+    const blkptr_t *bpp);
+int dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
+    arc_done_func_t *done, void *private, int priority, int zio_flags,
+    uint32_t *arc_flags, const zbookmark_t *zb);
+int dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
+    arc_done_func_t *done, void *private, int priority, int zio_flags,
+    uint32_t *arc_flags, const zbookmark_t *zb);
 void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx);
 void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
-
-int dsl_pool_scrub_cancel(dsl_pool_t *dp);
-int dsl_pool_scrub_clean(dsl_pool_t *dp);
-void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx);
-void dsl_pool_scrub_restart(dsl_pool_t *dp);
+void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx);
 
 taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);
 
+extern int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj,
+    const char *tag, uint64_t *now, dmu_tx_t *tx);
+extern int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj,
+    const char *tag, dmu_tx_t *tx);
+extern void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp);
+int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **);
+
 #ifdef __cplusplus
 }
 #endif
index 5afaa1f..a636ad3 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_DSL_PROP_H
@@ -49,6 +48,25 @@ typedef struct dsl_prop_cb_record {
        void *cbr_arg;
 } dsl_prop_cb_record_t;
 
+typedef struct dsl_props_arg {
+       nvlist_t *pa_props;
+       zprop_source_t pa_source;
+} dsl_props_arg_t;
+
+typedef struct dsl_prop_set_arg {
+       const char *psa_name;
+       zprop_source_t psa_source;
+       int psa_intsz;
+       int psa_numints;
+       const void *psa_value;
+
+       /*
+        * Used to handle the special requirements of the quota and reservation
+        * properties.
+        */
+       uint64_t psa_effective_value;
+} dsl_prop_setarg_t;
+
 int dsl_prop_register(struct dsl_dataset *ds, const char *propname,
     dsl_prop_changed_cb_t *callback, void *cbarg);
 int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname,
@@ -59,18 +77,36 @@ int dsl_prop_get(const char *ddname, const char *propname,
     int intsz, int numints, void *buf, char *setpoint);
 int dsl_prop_get_integer(const char *ddname, const char *propname,
     uint64_t *valuep, char *setpoint);
-int dsl_prop_get_all(objset_t *os, nvlist_t **nvp, boolean_t local);
+int dsl_prop_get_all(objset_t *os, nvlist_t **nvp);
+int dsl_prop_get_received(objset_t *os, nvlist_t **nvp);
 int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname,
     int intsz, int numints, void *buf, char *setpoint);
 int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname,
-    int intsz, int numints, void *buf, char *setpoint);
+    int intsz, int numints, void *buf, char *setpoint,
+    boolean_t snapshot);
 
 dsl_syncfunc_t dsl_props_set_sync;
 int dsl_prop_set(const char *ddname, const char *propname,
-    int intsz, int numints, const void *buf);
-int dsl_props_set(const char *dsname, nvlist_t *nvl);
+    zprop_source_t source, int intsz, int numints, const void *buf);
+int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl);
 void dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
-    cred_t *cr, dmu_tx_t *tx);
+    dmu_tx_t *tx);
+
+void dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname,
+    zprop_source_t source, uint64_t *value);
+int dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa);
+#ifdef ZFS_DEBUG
+void dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa);
+#define        DSL_PROP_CHECK_PREDICTION(dd, psa)      \
+       dsl_prop_check_prediction((dd), (psa))
+#else
+#define        DSL_PROP_CHECK_PREDICTION(dd, psa)      /* nothing */
+#endif
+
+/* flag first receive on or after SPA_VERSION_RECVD_PROPS */
+boolean_t dsl_prop_get_hasrecvd(objset_t *os);
+void dsl_prop_set_hasrecvd(objset_t *os);
+void dsl_prop_unset_hasrecvd(objset_t *os);
 
 void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value);
 void dsl_prop_nvlist_add_string(nvlist_t *nv,
diff --git a/module/zfs/include/sys/dsl_scan.h b/module/zfs/include/sys/dsl_scan.h
new file mode 100644 (file)
index 0000000..c79666e
--- /dev/null
@@ -0,0 +1,108 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef        _SYS_DSL_SCAN_H
+#define        _SYS_DSL_SCAN_H
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/bplist.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct objset;
+struct dsl_dir;
+struct dsl_dataset;
+struct dsl_pool;
+struct dmu_tx;
+
+/*
+ * All members of this structure must be uint64_t, for byteswap
+ * purposes.
+ */
+typedef struct dsl_scan_phys {
+       uint64_t scn_func; /* pool_scan_func_t */
+       uint64_t scn_state; /* dsl_scan_state_t */
+       uint64_t scn_queue_obj;
+       uint64_t scn_min_txg;
+       uint64_t scn_max_txg;
+       uint64_t scn_cur_min_txg;
+       uint64_t scn_cur_max_txg;
+       uint64_t scn_start_time;
+       uint64_t scn_end_time;
+       uint64_t scn_to_examine; /* total bytes to be scanned */
+       uint64_t scn_examined; /* bytes scanned so far */
+       uint64_t scn_to_process;
+       uint64_t scn_processed;
+       uint64_t scn_errors;    /* scan I/O error count */
+       uint64_t scn_ddt_class_max;
+       ddt_bookmark_t scn_ddt_bookmark;
+       zbookmark_t scn_bookmark;
+       uint64_t scn_flags; /* dsl_scan_flags_t */
+} dsl_scan_phys_t;
+
+#define        SCAN_PHYS_NUMINTS (sizeof (dsl_scan_phys_t) / sizeof (uint64_t))
+
+typedef enum dsl_scan_flags {
+       DSF_VISIT_DS_AGAIN = 1<<0,
+} dsl_scan_flags_t;
+
+typedef struct dsl_scan {
+       struct dsl_pool *scn_dp;
+
+       boolean_t scn_pausing;
+       uint64_t scn_restart_txg;
+       uint64_t scn_sync_start_time;
+       zio_t *scn_zio_root;
+
+       /* for debugging / information */
+       uint64_t scn_visited_this_txg;
+
+       dsl_scan_phys_t scn_phys;
+} dsl_scan_t;
+
+int dsl_scan_init(struct dsl_pool *dp, uint64_t txg);
+void dsl_scan_fini(struct dsl_pool *dp);
+void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *);
+int dsl_scan_cancel(struct dsl_pool *);
+int dsl_scan(struct dsl_pool *, pool_scan_func_t);
+void dsl_resilver_restart(struct dsl_pool *, uint64_t txg);
+boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
+boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
+void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+    ddt_entry_t *dde, dmu_tx_t *tx);
+void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
+    struct dmu_tx *tx);
+boolean_t dsl_scan_active(dsl_scan_t *scn);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_SCAN_H */
index 4995bfe..9126290 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_DSL_SYNCTASK_H
 #define        _SYS_DSL_SYNCTASK_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/txg.h>
 #include <sys/zfs_context.h>
 
@@ -38,7 +35,7 @@ extern "C" {
 struct dsl_pool;
 
 typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *);
-typedef void (dsl_syncfunc_t)(void *, void *, cred_t *, dmu_tx_t *);
+typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *);
 
 typedef struct dsl_sync_task {
        list_node_t dst_node;
@@ -53,7 +50,6 @@ typedef struct dsl_sync_task_group {
        txg_node_t dstg_node;
        list_t dstg_tasks;
        struct dsl_pool *dstg_pool;
-       cred_t *dstg_cr;
        uint64_t dstg_txg;
        int dstg_err;
        int dstg_space;
index 21b7dbe..c752edc 100644 (file)
@@ -68,6 +68,18 @@ extern "C" {
 #define        FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET       "zio_offset"
 #define        FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE         "zio_size"
 #define        FM_EREPORT_PAYLOAD_ZFS_PREV_STATE       "prev_state"
+#define        FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED   "cksum_expected"
+#define        FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL     "cksum_actual"
+#define        FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO       "cksum_algorithm"
+#define        FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP   "cksum_byteswap"
+#define        FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES "bad_ranges"
+#define        FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP "bad_ranges_min_gap"
+#define        FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS   "bad_range_sets"
+#define        FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS "bad_range_clears"
+#define        FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS     "bad_set_bits"
+#define        FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS "bad_cleared_bits"
+#define        FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM "bad_set_histogram"
+#define        FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM "bad_cleared_histogram"
 
 #define        FM_EREPORT_FAILMODE_WAIT                "wait"
 #define        FM_EREPORT_FAILMODE_CONTINUE            "continue"
@@ -75,6 +87,7 @@ extern "C" {
 
 #define        FM_RESOURCE_REMOVED                     "removed"
 #define        FM_RESOURCE_AUTOREPLACE                 "autoreplace"
+#define        FM_RESOURCE_STATECHANGE                 "statechange"
 
 #ifdef __cplusplus
 }
index 767fb07..c4103c4 100644 (file)
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_FM_PROTOCOL_H
@@ -47,6 +46,7 @@ extern "C" {
 /* FM event class values */
 #define        FM_EREPORT_CLASS                "ereport"
 #define        FM_FAULT_CLASS                  "fault"
+#define        FM_DEFECT_CLASS                 "defect"
 #define        FM_RSRC_CLASS                   "resource"
 #define        FM_LIST_EVENT                   "list"
 
@@ -83,6 +83,7 @@ extern "C" {
 #define        FM_SUSPECT_FAULT_LIST           "fault-list"
 #define        FM_SUSPECT_FAULT_SZ             "fault-list-sz"
 #define        FM_SUSPECT_FAULT_STATUS         "fault-status"
+#define        FM_SUSPECT_INJECTED             "__injected"
 #define        FM_SUSPECT_MESSAGE              "message"
 #define        FM_SUSPECT_RETIRE               "retire"
 #define        FM_SUSPECT_RESPONSE             "response"
@@ -122,6 +123,7 @@ extern "C" {
 #define        FM_RSRC_ASRU_REPAIRED           "repaired"
 #define        FM_RSRC_ASRU_REPLACED           "replaced"
 #define        FM_RSRC_ASRU_ACQUITTED          "acquitted"
+#define        FM_RSRC_ASRU_RESOLVED           "resolved"
 #define        FM_RSRC_ASRU_UNUSABLE           "unusable"
 #define        FM_RSRC_ASRU_EVENT              "event"
 
@@ -170,6 +172,7 @@ extern "C" {
 
 /* FMRI authority-type member names */
 #define        FM_FMRI_AUTH_CHASSIS            "chassis-id"
+#define        FM_FMRI_AUTH_PRODUCT_SN         "product-sn"
 #define        FM_FMRI_AUTH_PRODUCT            "product-id"
 #define        FM_FMRI_AUTH_DOMAIN             "domain-id"
 #define        FM_FMRI_AUTH_SERVER             "server-id"
@@ -243,6 +246,7 @@ extern "C" {
 
 /* dev scheme member names */
 #define        FM_FMRI_DEV_ID                  "devid"
+#define        FM_FMRI_DEV_TGTPTLUN0           "target-port-l0id"
 #define        FM_FMRI_DEV_PATH                "device-path"
 
 /* pkg scheme member names */
@@ -311,7 +315,7 @@ extern int i_fm_payload_set(nvlist_t *, const char *, va_list);
 extern void fm_fmri_hc_set(nvlist_t *, int, const nvlist_t *, nvlist_t *,
     int, ...);
 extern void fm_fmri_dev_set(nvlist_t *, int, const nvlist_t *, const char *,
-    const char *);
+    const char *, const char *);
 extern void fm_fmri_de_set(nvlist_t *, int, const nvlist_t *, const char *);
 extern void fm_fmri_cpu_set(nvlist_t *, int, const nvlist_t *, uint32_t,
     uint8_t *, const char *);
@@ -320,6 +324,8 @@ extern void fm_fmri_mem_set(nvlist_t *, int, const nvlist_t *, const char *,
 extern void fm_authority_set(nvlist_t *, int, const char *, const char *,
     const char *, const char *);
 extern void fm_fmri_zfs_set(nvlist_t *, int, uint64_t, uint64_t);
+extern void fm_fmri_hc_create(nvlist_t *, int, const nvlist_t *, nvlist_t *,
+    nvlist_t *, int, ...);
 
 extern uint64_t fm_ena_increment(uint64_t);
 extern uint64_t fm_ena_generate(uint64_t, uchar_t);
index 5d3e11c..583d630 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_METASLAB_H
@@ -36,9 +35,6 @@
 extern "C" {
 #endif
 
-typedef struct metaslab_class metaslab_class_t;
-typedef struct metaslab_group metaslab_group_t;
-
 extern space_map_ops_t *zfs_metaslab_ops;
 
 extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
@@ -46,6 +42,7 @@ extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
 extern void metaslab_fini(metaslab_t *msp);
 extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
 extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
+extern void metaslab_sync_reassess(metaslab_group_t *mg);
 
 #define        METASLAB_HINTBP_FAVOR   0x0
 #define        METASLAB_HINTBP_AVOID   0x1
@@ -57,14 +54,24 @@ extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
     boolean_t now);
 extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
 
-extern metaslab_class_t *metaslab_class_create(space_map_ops_t *ops);
+extern metaslab_class_t *metaslab_class_create(spa_t *spa,
+    space_map_ops_t *ops);
 extern void metaslab_class_destroy(metaslab_class_t *mc);
-extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg);
-extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg);
+extern int metaslab_class_validate(metaslab_class_t *mc);
+
+extern void metaslab_class_space_update(metaslab_class_t *mc,
+    int64_t alloc_delta, int64_t defer_delta,
+    int64_t space_delta, int64_t dspace_delta);
+extern uint64_t metaslab_class_get_alloc(metaslab_class_t *mc);
+extern uint64_t metaslab_class_get_space(metaslab_class_t *mc);
+extern uint64_t metaslab_class_get_dspace(metaslab_class_t *mc);
+extern uint64_t metaslab_class_get_deferred(metaslab_class_t *mc);
 
 extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
     vdev_t *vd);
 extern void metaslab_group_destroy(metaslab_group_t *mg);
+extern void metaslab_group_activate(metaslab_group_t *mg);
+extern void metaslab_group_passivate(metaslab_group_t *mg);
 
 #ifdef __cplusplus
 }
index d67dea7..07988dd 100644 (file)
@@ -37,16 +37,23 @@ extern "C" {
 #endif
 
 struct metaslab_class {
+       spa_t                   *mc_spa;
        metaslab_group_t        *mc_rotor;
-       uint64_t                mc_allocated;
        space_map_ops_t         *mc_ops;
+       uint64_t                mc_aliquot;
+       uint64_t                mc_alloc;       /* total allocated space */
+       uint64_t                mc_deferred;    /* total deferred frees */
+       uint64_t                mc_space;       /* total space (alloc + free) */
+       uint64_t                mc_dspace;      /* total deflated space */
 };
 
 struct metaslab_group {
        kmutex_t                mg_lock;
        avl_tree_t              mg_metaslab_tree;
        uint64_t                mg_aliquot;
+       uint64_t                mg_bonus_area;
        int64_t                 mg_bias;
+       int64_t                 mg_activation_count;
        metaslab_class_t        *mg_class;
        vdev_t                  *mg_vd;
        metaslab_group_t        *mg_prev;
@@ -66,7 +73,9 @@ struct metaslab {
        space_map_obj_t ms_smo_syncing; /* syncing space map object     */
        space_map_t     ms_allocmap[TXG_SIZE];  /* allocated this txg   */
        space_map_t     ms_freemap[TXG_SIZE];   /* freed this txg       */
+       space_map_t     ms_defermap[TXG_DEFER_SIZE]; /* deferred frees  */
        space_map_t     ms_map;         /* in-core free space map       */
+       int64_t         ms_deferspace;  /* sum of ms_defermap[] space   */
        uint64_t        ms_weight;      /* weight vs. others in group   */
        metaslab_group_t *ms_group;     /* metaslab group               */
        avl_node_t      ms_group_node;  /* node in metaslab group tree  */
index d3fe7b1..bc3ade8 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_REFCOUNT_H
 #define        _SYS_REFCOUNT_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/inttypes.h>
 #include <sys/list.h>
 #include <sys/zfs_context.h>
@@ -91,6 +88,11 @@ typedef struct refcount {
        atomic_add_64_nv(&(rc)->rc_count, number)
 #define        refcount_remove_many(rc, number, holder) \
        atomic_add_64_nv(&(rc)->rc_count, -number)
+#define        refcount_transfer(dst, src) { \
+       uint64_t __tmp = (src)->rc_count; \
+       atomic_add_64(&(src)->rc_count, -__tmp); \
+       atomic_add_64(&(dst)->rc_count, __tmp); \
+}
 
 #define        refcount_init()
 #define        refcount_fini()
diff --git a/module/zfs/include/sys/sa.h b/module/zfs/include/sys/sa.h
new file mode 100644 (file)
index 0000000..e9a96a0
--- /dev/null
@@ -0,0 +1,171 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef        _SYS_SA_H
+#define        _SYS_SA_H
+
+#include <sys/dmu.h>
+
+/*
+ * Currently available byteswap functions.
+ * If at all possible, new attributes should use
+ * one of the already defined byteswap functions.
+ * If a new byteswap function is added then the
+ * ZPL/Pool version will need to be bumped.
+ */
+
+typedef enum sa_bswap_type {
+       SA_UINT64_ARRAY,
+       SA_UINT32_ARRAY,
+       SA_UINT16_ARRAY,
+       SA_UINT8_ARRAY,
+       SA_ACL,
+} sa_bswap_type_t;
+
+typedef uint16_t       sa_attr_type_t;
+
+/*
+ * Attribute to register support for.
+ */
+typedef struct sa_attr_reg {
+       char                    *sa_name;       /* attribute name */
+       uint16_t                sa_length;
+       sa_bswap_type_t         sa_byteswap;    /* bswap function enum */
+       sa_attr_type_t          sa_attr; /* filled in during registration */
+} sa_attr_reg_t;
+
+
+typedef void (sa_data_locator_t)(void **, uint32_t *, uint32_t,
+    boolean_t, void *userptr);
+
+/*
+ * array of attributes to store.
+ *
+ * This array should be treated as opaque/private data.
+ * The SA_BULK_ADD_ATTR() macro should be used for manipulating
+ * the array.
+ *
+ * When sa_replace_all_by_template() is used the attributes
+ * will be stored in the order defined in the array, except that
+ * the attributes may be split between the bonus and the spill buffer
+ *
+ */
+typedef struct sa_bulk_attr {
+       void                    *sa_data;
+       sa_data_locator_t       *sa_data_func;
+       uint16_t                sa_length;
+       sa_attr_type_t          sa_attr;
+       /* the following are private to the sa framework */
+       void                    *sa_addr;
+       uint16_t                sa_buftype;
+       uint16_t                sa_size;
+} sa_bulk_attr_t;
+
+
+/*
+ * special macro for adding entries for bulk attr support
+ * bulk - sa_bulk_attr_t
+ * count - integer that will be incremented during each add
+ * attr - attribute to manipulate
+ * func - function for accessing data.
+ * data - pointer to data.
+ * len - length of data
+ */
+
+#define        SA_ADD_BULK_ATTR(b, idx, attr, func, data, len) \
+{ \
+       b[idx].sa_attr = attr;\
+       b[idx].sa_data_func = func; \
+       b[idx].sa_data = data; \
+       b[idx++].sa_length = len; \
+}
+
+typedef struct sa_os sa_os_t;
+
+typedef enum sa_handle_type {
+       SA_HDL_SHARED,
+       SA_HDL_PRIVATE
+} sa_handle_type_t;
+
+struct sa_handle;
+typedef void *sa_lookup_tab_t;
+typedef struct sa_handle sa_handle_t;
+
+typedef void (sa_update_cb_t)(sa_handle_t *, dmu_tx_t *tx);
+
+int sa_handle_get(objset_t *, uint64_t, void *userp,
+    sa_handle_type_t, sa_handle_t **);
+int sa_handle_get_from_db(objset_t *, dmu_buf_t *, void *userp,
+    sa_handle_type_t, sa_handle_t **);
+void sa_handle_destroy(sa_handle_t *);
+int sa_buf_hold(objset_t *, uint64_t, void *, dmu_buf_t **);
+void sa_buf_rele(dmu_buf_t *, void *);
+int sa_lookup(sa_handle_t *, sa_attr_type_t, void *buf, uint32_t buflen);
+int sa_update(sa_handle_t *, sa_attr_type_t, void *buf,
+    uint32_t buflen, dmu_tx_t *);
+int sa_remove(sa_handle_t *, sa_attr_type_t, dmu_tx_t *);
+int sa_bulk_lookup(sa_handle_t *, sa_bulk_attr_t *, int count);
+int sa_bulk_lookup_locked(sa_handle_t *, sa_bulk_attr_t *, int count);
+int sa_bulk_update(sa_handle_t *, sa_bulk_attr_t *, int count, dmu_tx_t *);
+int sa_size(sa_handle_t *, sa_attr_type_t, int *);
+int sa_update_from_cb(sa_handle_t *, sa_attr_type_t,
+    uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *);
+void sa_object_info(sa_handle_t *, dmu_object_info_t *);
+void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *);
+void sa_update_user(sa_handle_t *, sa_handle_t *);
+void *sa_get_userdata(sa_handle_t *);
+void sa_set_userp(sa_handle_t *, void *);
+dmu_buf_t *sa_get_db(sa_handle_t *);
+uint64_t sa_handle_object(sa_handle_t *);
+boolean_t sa_attr_would_spill(sa_handle_t *, sa_attr_type_t, int size);
+void sa_register_update_callback(objset_t *, sa_update_cb_t *);
+sa_attr_type_t *sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int);
+void sa_tear_down(objset_t *);
+int sa_replace_all_by_template(sa_handle_t *, sa_bulk_attr_t *,
+    int, dmu_tx_t *);
+int sa_replace_all_by_template_locked(sa_handle_t *, sa_bulk_attr_t *,
+    int, dmu_tx_t *);
+boolean_t sa_enabled(objset_t *);
+void sa_cache_init();
+void sa_cache_fini();
+int sa_set_sa_object(objset_t *, uint64_t);
+int sa_hdrsize(void *);
+void sa_handle_lock(sa_handle_t *);
+void sa_handle_unlock(sa_handle_t *);
+
+#ifdef _KERNEL
+int sa_lookup_uio(sa_handle_t *, sa_attr_type_t, uio_t *);
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SA_H */
diff --git a/module/zfs/include/sys/sa_impl.h b/module/zfs/include/sys/sa_impl.h
new file mode 100644 (file)
index 0000000..62497e7
--- /dev/null
@@ -0,0 +1,288 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef        _SYS_SA_IMPL_H
+#define        _SYS_SA_IMPL_H
+
+#include <sys/dmu.h>
+#include <sys/refcount.h>
+#include <sys/list.h>
+
+/*
+ * Array of known attributes and their
+ * various characteristics.
+ */
+typedef struct sa_attr_table {
+       sa_attr_type_t  sa_attr;
+       uint8_t sa_registered;
+       uint16_t sa_length;
+       sa_bswap_type_t sa_byteswap;
+       char *sa_name;
+} sa_attr_table_t;
+
+/*
+ * Zap attribute format for attribute registration
+ *
+ * 64      56      48      40      32      24      16      8       0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * |        unused         |      len      | bswap |   attr num    |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Zap attribute format for layout information.
+ *
+ * layout information is stored as an array of attribute numbers
+ * The name of the attribute is the layout number (0, 1, 2, ...)
+ *
+ * 16       0
+ * +--------+
+ * | attr # |
+ * +--------+
+ * | attr # |
+ * +--------+
+ *  ......
+ *
+ */
+
+#define        ATTR_BSWAP(x)   BF32_GET(x, 16, 8)
+#define        ATTR_LENGTH(x)  BF32_GET(x, 24, 16)
+#define        ATTR_NUM(x)     BF32_GET(x, 0, 16)
+#define        ATTR_ENCODE(x, attr, length, bswap) \
+{ \
+       BF64_SET(x, 24, 16, length); \
+       BF64_SET(x, 16, 8, bswap); \
+       BF64_SET(x, 0, 16, attr); \
+}
+
+#define        TOC_OFF(x)              BF32_GET(x, 0, 23)
+#define        TOC_ATTR_PRESENT(x)     BF32_GET(x, 31, 1)
+#define        TOC_LEN_IDX(x)          BF32_GET(x, 24, 4)
+#define        TOC_ATTR_ENCODE(x, len_idx, offset) \
+{ \
+       BF32_SET(x, 31, 1, 1); \
+       BF32_SET(x, 24, 7, len_idx); \
+       BF32_SET(x, 0, 24, offset); \
+}
+
+#define        SA_LAYOUTS      "LAYOUTS"
+#define        SA_REGISTRY     "REGISTRY"
+
+/*
+ * Each unique layout will have their own table
+ * sa_lot (layout_table)
+ */
+typedef struct sa_lot {
+       avl_node_t lot_num_node;
+       avl_node_t lot_hash_node;
+       uint64_t lot_num;
+       uint64_t lot_hash;
+       sa_attr_type_t *lot_attrs;      /* array of attr #'s */
+       uint32_t lot_var_sizes; /* how many aren't fixed size */
+       uint32_t lot_attr_count;        /* total attr count */
+       list_t  lot_idx_tab;    /* should be only a couple of entries */
+       int     lot_instance;   /* used with lot_hash to identify entry */
+} sa_lot_t;
+
+/* index table of offsets */
+typedef struct sa_idx_tab {
+       list_node_t     sa_next;
+       sa_lot_t        *sa_layout;
+       uint16_t        *sa_variable_lengths;
+       refcount_t      sa_refcount;
+       uint32_t        *sa_idx_tab;    /* array of offsets */
+} sa_idx_tab_t;
+
+/*
+ * Since the offset/index information into the actual data
+ * will usually be identical we can share that information with
+ * all handles that have the exact same offsets.
+ *
+ * You would typically only have a large number of different table of
+ * contents if you had several variable sized attributes.
+ *
+ * Two AVL trees are used to track the attribute layout numbers.
+ * One is keyed by number and will be consulted when a DMU_OT_SA
+ * object is first read.  The second tree is keyed by the hash signature
+ * of the attributes and will be consulted when an attribute is added
+ * to determine if we already have an instance of that layout.  Both
+ * of these trees are interconnected.  The only difference is that
+ * when an entry is found in the "hash" tree the list of attributes will
+ * need to be compared against the list of attributes you have in hand.
+ * The assumption is that typically attributes will just be updated and
+ * adding a completely new attribute is a very rare operation.
+ */
+struct sa_os {
+       kmutex_t        sa_lock;
+       boolean_t       sa_need_attr_registration;
+       boolean_t       sa_force_spill;
+       uint64_t        sa_master_obj;
+       uint64_t        sa_reg_attr_obj;
+       uint64_t        sa_layout_attr_obj;
+       int             sa_num_attrs;
+       sa_attr_table_t *sa_attr_table;  /* private attr table */
+       sa_update_cb_t  *sa_update_cb;
+       avl_tree_t      sa_layout_num_tree;  /* keyed by layout number */
+       avl_tree_t      sa_layout_hash_tree; /* keyed by layout hash value */
+       int             sa_user_table_sz;
+       sa_attr_type_t  *sa_user_table; /* user name->attr mapping table */
+};
+
+/*
+ * header for all bonus and spill buffers.
+ * The header has a fixed portion with a variable number
+ * of "lengths" depending on the number of variable sized
+ * attributes, which are determined by the "layout number"
+ */
+
+#define        SA_MAGIC        0x2F505A  /* ZFS SA */
+typedef struct sa_hdr_phys {
+       uint32_t sa_magic;
+       uint16_t sa_layout_info;  /* Encoded with hdrsize and layout number */
+       uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */
+       /* ... Data follows the lengths.  */
+} sa_hdr_phys_t;
+
+/*
+ * sa_hdr_phys -> sa_layout_info
+ *
+ * 16      10       0
+ * +--------+-------+
+ * | hdrsz  |layout |
+ * +--------+-------+
+ *
+ * Bits 0-9 (the low 10 bits) are the layout number.
+ * Bits 10-15 are the size of the header.
+ * The hdrsize is the number * 8
+ *
+ * For example.
+ * hdrsz of 1 ==> 8 byte header
+ *          2 ==> 16 byte header
+ *
+ */
+
+#define        SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10)
+#define        SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 16, 3, 0)
+#define        SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \
+{ \
+       BF32_SET_SB(x, 10, 6, 3, 0, size); \
+       BF32_SET(x, 0, 10, num); \
+}
+
+typedef enum sa_buf_type {
+       SA_BONUS = 1,
+       SA_SPILL = 2
+} sa_buf_type_t;
+
+typedef enum sa_data_op {
+       SA_LOOKUP,
+       SA_UPDATE,
+       SA_ADD,
+       SA_REPLACE,
+       SA_REMOVE
+} sa_data_op_t;
+
+/*
+ * Opaque handle used for most sa functions
+ *
+ * This needs to be kept as small as possible.
+ */
+
+struct sa_handle {
+       kmutex_t        sa_lock;
+       dmu_buf_t       *sa_bonus;
+       dmu_buf_t       *sa_spill;
+       objset_t        *sa_os;
+       void            *sa_userp;
+       sa_idx_tab_t    *sa_bonus_tab;   /* idx of bonus */
+       sa_idx_tab_t    *sa_spill_tab; /* only present if spill activated */
+};
+
+#define        SA_GET_DB(hdl, type)    \
+       (dmu_buf_impl_t *)((type == SA_BONUS) ? hdl->sa_bonus : hdl->sa_spill)
+
+#define        SA_GET_HDR(hdl, type) \
+       ((sa_hdr_phys_t *)((dmu_buf_impl_t *)(SA_GET_DB(hdl, \
+       type))->db.db_data))
+
+#define        SA_IDX_TAB_GET(hdl, type) \
+       (type == SA_BONUS ? hdl->sa_bonus_tab : hdl->sa_spill_tab)
+
+#define        IS_SA_BONUSTYPE(a)      \
+       ((a == DMU_OT_SA) ? B_TRUE : B_FALSE)
+
+#define        SA_BONUSTYPE_FROM_DB(db) \
+       (((dmu_buf_impl_t *)db)->db_dnode->dn_bonustype)
+
+#define        SA_BLKPTR_SPACE (DN_MAX_BONUSLEN - sizeof (blkptr_t))
+
+#define        SA_LAYOUT_NUM(x, type) \
+       ((!IS_SA_BONUSTYPE(type) ? 0 : (((IS_SA_BONUSTYPE(type)) && \
+       ((SA_HDR_LAYOUT_NUM(x)) == 0)) ? 1 : SA_HDR_LAYOUT_NUM(x))))
+
+
+#define        SA_REGISTERED_LEN(sa, attr) sa->sa_attr_table[attr].sa_length
+
+#define        SA_ATTR_LEN(sa, idx, attr, hdr) ((SA_REGISTERED_LEN(sa, attr) == 0) ?\
+       hdr->sa_lengths[TOC_LEN_IDX(idx->sa_idx_tab[attr])] : \
+       SA_REGISTERED_LEN(sa, attr))
+
+#define        SA_SET_HDR(hdr, num, size) \
+       { \
+               hdr->sa_magic = SA_MAGIC; \
+               SA_HDR_LAYOUT_INFO_ENCODE(hdr->sa_layout_info, num, size); \
+       }
+
+#define        SA_ATTR_INFO(sa, idx, hdr, attr, bulk, type, hdl) \
+       { \
+               bulk.sa_size = SA_ATTR_LEN(sa, idx, attr, hdr); \
+               bulk.sa_buftype = type; \
+               bulk.sa_addr = \
+                   (void *)((uintptr_t)TOC_OFF(idx->sa_idx_tab[attr]) + \
+                   (uintptr_t)hdr); \
+}
+
+#define        SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) \
+       (SA_HDR_SIZE(hdr) == (sizeof (sa_hdr_phys_t) + \
+       (tb->lot_var_sizes > 1 ? P2ROUNDUP((tb->lot_var_sizes - 1) * \
+       sizeof (uint16_t), 8) : 0)))
+
+int sa_add_impl(sa_handle_t *, sa_attr_type_t,
+    uint32_t, sa_data_locator_t, void *, dmu_tx_t *);
+
+void sa_register_update_callback_locked(objset_t *, sa_update_cb_t *);
+int sa_size_locked(sa_handle_t *, sa_attr_type_t, int *);
+
+void sa_default_locator(void **, uint32_t *, uint32_t, boolean_t, void *);
+int sa_attr_size(sa_os_t *, sa_idx_tab_t *, sa_attr_type_t,
+    uint16_t *, sa_hdr_phys_t *);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SA_IMPL_H */
index 0a4d550..41a4030 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_SPA_H
@@ -43,8 +42,13 @@ extern "C" {
 typedef struct spa spa_t;
 typedef struct vdev vdev_t;
 typedef struct metaslab metaslab_t;
+typedef struct metaslab_group metaslab_group_t;
+typedef struct metaslab_class metaslab_class_t;
+typedef struct zio zio_t;
 typedef struct zilog zilog_t;
 typedef struct spa_aux_vdev spa_aux_vdev_t;
+typedef struct ddt ddt_t;
+typedef struct ddt_entry ddt_entry_t;
 struct dsl_pool;
 
 /*
@@ -134,15 +138,15 @@ typedef struct zio_cksum {
  *     +-------+-------+-------+-------+-------+-------+-------+-------+
  * 5   |G|                      offset3                                |
  *     +-------+-------+-------+-------+-------+-------+-------+-------+
- * 6   |E| lvl | type  | cksum | comp  |     PSIZE     |     LSIZE     |
+ * 6   |BDX|lvl| type  | cksum | comp  |     PSIZE     |     LSIZE     |
  *     +-------+-------+-------+-------+-------+-------+-------+-------+
  * 7   |                       padding                                 |
  *     +-------+-------+-------+-------+-------+-------+-------+-------+
  * 8   |                       padding                                 |
  *     +-------+-------+-------+-------+-------+-------+-------+-------+
- * 9   |                       padding                                 |
+ * 9   |                       physical birth txg                      |
  *     +-------+-------+-------+-------+-------+-------+-------+-------+
- * a   |                       birth txg                               |
+ * a   |                       logical birth txg                       |
  *     +-------+-------+-------+-------+-------+-------+-------+-------+
  * b   |                       fill count                              |
  *     +-------+-------+-------+-------+-------+-------+-------+-------+
@@ -166,25 +170,29 @@ typedef struct zio_cksum {
  * cksum       checksum function
  * comp                compression function
  * G           gang block indicator
- * E           endianness
- * type                DMU object type
+ * B           byteorder (endianness)
+ * D           dedup
+ * X           unused
  * lvl         level of indirection
- * birth txg   transaction group in which the block was born
+ * type                DMU object type
+ * phys birth  txg of block allocation; zero if same as logical birth txg
+ * log. birth  transaction group in which the block was logically born
  * fill count  number of non-zero blocks under this bp
  * checksum[4] 256-bit checksum of the data this bp describes
  */
-typedef struct blkptr {
-       dva_t           blk_dva[3];     /* 128-bit Data Virtual Address */
-       uint64_t        blk_prop;       /* size, compression, type, etc */
-       uint64_t        blk_pad[3];     /* Extra space for the future   */
-       uint64_t        blk_birth;      /* transaction group at birth   */
-       uint64_t        blk_fill;       /* fill count                   */
-       zio_cksum_t     blk_cksum;      /* 256-bit checksum             */
-} blkptr_t;
-
 #define        SPA_BLKPTRSHIFT 7               /* blkptr_t is 128 bytes        */
 #define        SPA_DVAS_PER_BP 3               /* Number of DVAs in a bp       */
 
+typedef struct blkptr {
+       dva_t           blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
+       uint64_t        blk_prop;       /* size, compression, type, etc     */
+       uint64_t        blk_pad[2];     /* Extra space for the future       */
+       uint64_t        blk_phys_birth; /* txg when block was allocated     */
+       uint64_t        blk_birth;      /* transaction group at birth       */
+       uint64_t        blk_fill;       /* fill count                       */
+       zio_cksum_t     blk_cksum;      /* 256-bit checksum                 */
+} blkptr_t;
+
 /*
  * Macros to get and set fields in a bp or DVA.
  */
@@ -208,8 +216,7 @@ typedef struct blkptr {
 #define        DVA_SET_GANG(dva, x)    BF64_SET((dva)->dva_word[1], 63, 1, x)
 
 #define        BP_GET_LSIZE(bp)        \
-       (BP_IS_HOLE(bp) ? 0 : \
-       BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1))
+       BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
 #define        BP_SET_LSIZE(bp, x)     \
        BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
 
@@ -218,20 +225,35 @@ typedef struct blkptr {
 #define        BP_SET_PSIZE(bp, x)     \
        BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
 
-#define        BP_GET_COMPRESS(bp)     BF64_GET((bp)->blk_prop, 32, 8)
-#define        BP_SET_COMPRESS(bp, x)  BF64_SET((bp)->blk_prop, 32, 8, x)
+#define        BP_GET_COMPRESS(bp)             BF64_GET((bp)->blk_prop, 32, 8)
+#define        BP_SET_COMPRESS(bp, x)          BF64_SET((bp)->blk_prop, 32, 8, x)
+
+#define        BP_GET_CHECKSUM(bp)             BF64_GET((bp)->blk_prop, 40, 8)
+#define        BP_SET_CHECKSUM(bp, x)          BF64_SET((bp)->blk_prop, 40, 8, x)
 
-#define        BP_GET_CHECKSUM(bp)     BF64_GET((bp)->blk_prop, 40, 8)
-#define        BP_SET_CHECKSUM(bp, x)  BF64_SET((bp)->blk_prop, 40, 8, x)
+#define        BP_GET_TYPE(bp)                 BF64_GET((bp)->blk_prop, 48, 8)
+#define        BP_SET_TYPE(bp, x)              BF64_SET((bp)->blk_prop, 48, 8, x)
 
-#define        BP_GET_TYPE(bp)         BF64_GET((bp)->blk_prop, 48, 8)
-#define        BP_SET_TYPE(bp, x)      BF64_SET((bp)->blk_prop, 48, 8, x)
+#define        BP_GET_LEVEL(bp)                BF64_GET((bp)->blk_prop, 56, 5)
+#define        BP_SET_LEVEL(bp, x)             BF64_SET((bp)->blk_prop, 56, 5, x)
 
-#define        BP_GET_LEVEL(bp)        BF64_GET((bp)->blk_prop, 56, 5)
-#define        BP_SET_LEVEL(bp, x)     BF64_SET((bp)->blk_prop, 56, 5, x)
+#define        BP_GET_PROP_BIT_61(bp)          BF64_GET((bp)->blk_prop, 61, 1)
+#define        BP_SET_PROP_BIT_61(bp, x)       BF64_SET((bp)->blk_prop, 61, 1, x)
 
-#define        BP_GET_BYTEORDER(bp)    (0 - BF64_GET((bp)->blk_prop, 63, 1))
-#define        BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
+#define        BP_GET_DEDUP(bp)                BF64_GET((bp)->blk_prop, 62, 1)
+#define        BP_SET_DEDUP(bp, x)             BF64_SET((bp)->blk_prop, 62, 1, x)
+
+#define        BP_GET_BYTEORDER(bp)            (0 - BF64_GET((bp)->blk_prop, 63, 1))
+#define        BP_SET_BYTEORDER(bp, x)         BF64_SET((bp)->blk_prop, 63, 1, x)
+
+#define        BP_PHYSICAL_BIRTH(bp)           \
+       ((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
+
+#define        BP_SET_BIRTH(bp, logical, physical)     \
+{                                              \
+       (bp)->blk_birth = (logical);            \
+       (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
+}
 
 #define        BP_GET_ASIZE(bp)        \
        (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
@@ -239,7 +261,7 @@ typedef struct blkptr {
 
 #define        BP_GET_UCSIZE(bp) \
        ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \
-       BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp));
+       BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
 
 #define        BP_GET_NDVAS(bp)        \
        (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
@@ -255,6 +277,12 @@ typedef struct blkptr {
        ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
        (dva1)->dva_word[0] == (dva2)->dva_word[0])
 
+#define        BP_EQUAL(bp1, bp2)      \
+       (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) &&    \
+       DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) &&    \
+       DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) &&    \
+       DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
+
 #define        ZIO_CHECKSUM_EQUAL(zc1, zc2) \
        (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
        ((zc1).zc_word[1] - (zc2).zc_word[1]) | \
@@ -274,7 +302,10 @@ typedef struct blkptr {
 #define        BP_IDENTITY(bp)         (&(bp)->blk_dva[0])
 #define        BP_IS_GANG(bp)          DVA_GET_GANG(BP_IDENTITY(bp))
 #define        BP_IS_HOLE(bp)          ((bp)->blk_birth == 0)
-#define        BP_IS_OLDER(bp, txg)    (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg))
+
+/* BP_IS_RAIDZ(bp) assumes no block compression */
+#define        BP_IS_RAIDZ(bp)         (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
+                               BP_GET_PSIZE(bp))
 
 #define        BP_ZERO(bp)                             \
 {                                              \
@@ -287,14 +318,12 @@ typedef struct blkptr {
        (bp)->blk_prop = 0;                     \
        (bp)->blk_pad[0] = 0;                   \
        (bp)->blk_pad[1] = 0;                   \
-       (bp)->blk_pad[2] = 0;                   \
+       (bp)->blk_phys_birth = 0;               \
        (bp)->blk_birth = 0;                    \
        (bp)->blk_fill = 0;                     \
        ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
 }
 
-#define        BLK_FILL_ALREADY_FREED  (-1ULL)
-
 /*
  * Note: the byteorder is either 0 or -1, both of which are palindromes.
  * This simplifies the endianness handling a bit.
@@ -309,17 +338,81 @@ typedef struct blkptr {
 
 #define        BP_SPRINTF_LEN  320
 
+/*
+ * This macro allows code sharing between zfs, libzpool, and mdb.
+ * 'func' is either snprintf() or mdb_snprintf().
+ * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
+ */
+#define        SPRINTF_BLKPTR(func, ws, buf, bp, type, checksum, compress)     \
+{                                                                      \
+       static const char *copyname[] =                                 \
+           { "zero", "single", "double", "triple" };                   \
+       int size = BP_SPRINTF_LEN;                                      \
+       int len = 0;                                                    \
+       int copies = 0;                                                 \
+                                                                       \
+       if (bp == NULL) {                                               \
+               len = func(buf + len, size - len, "<NULL>");            \
+       } else if (BP_IS_HOLE(bp)) {                                    \
+               len = func(buf + len, size - len, "<hole>");            \
+       } else {                                                        \
+               for (int d = 0; d < BP_GET_NDVAS(bp); d++) {            \
+                       const dva_t *dva = &bp->blk_dva[d];             \
+                       if (DVA_IS_VALID(dva))                          \
+                               copies++;                               \
+                       len += func(buf + len, size - len,              \
+                           "DVA[%d]=<%llu:%llx:%llx>%c", d,            \
+                           (u_longlong_t)DVA_GET_VDEV(dva),            \
+                           (u_longlong_t)DVA_GET_OFFSET(dva),          \
+                           (u_longlong_t)DVA_GET_ASIZE(dva),           \
+                           ws);                                        \
+               }                                                       \
+               if (BP_IS_GANG(bp) &&                                   \
+                   DVA_GET_ASIZE(&bp->blk_dva[2]) <=                   \
+                   DVA_GET_ASIZE(&bp->blk_dva[1]) / 2)                 \
+                       copies--;                                       \
+               len += func(buf + len, size - len,                      \
+                   "[L%llu %s] %s %s %s %s %s %s%c"                    \
+                   "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c"    \
+                   "cksum=%llx:%llx:%llx:%llx",                        \
+                   (u_longlong_t)BP_GET_LEVEL(bp),                     \
+                   type,                                               \
+                   checksum,                                           \
+                   compress,                                           \
+                   BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",            \
+                   BP_IS_GANG(bp) ? "gang" : "contiguous",             \
+                   BP_GET_DEDUP(bp) ? "dedup" : "unique",              \
+                   copyname[copies],                                   \
+                   ws,                                                 \
+                   (u_longlong_t)BP_GET_LSIZE(bp),                     \
+                   (u_longlong_t)BP_GET_PSIZE(bp),                     \
+                   (u_longlong_t)bp->blk_birth,                        \
+                   (u_longlong_t)BP_PHYSICAL_BIRTH(bp),                \
+                   (u_longlong_t)bp->blk_fill,                         \
+                   ws,                                                 \
+                   (u_longlong_t)bp->blk_cksum.zc_word[0],             \
+                   (u_longlong_t)bp->blk_cksum.zc_word[1],             \
+                   (u_longlong_t)bp->blk_cksum.zc_word[2],             \
+                   (u_longlong_t)bp->blk_cksum.zc_word[3]);            \
+       }                                                               \
+       ASSERT(len < size);                                             \
+}
+
 #include <sys/dmu.h>
 
 #define        BP_GET_BUFC_TYPE(bp)                                            \
        (((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \
        ARC_BUFC_METADATA : ARC_BUFC_DATA);
-/*
- * Routines found in spa.c
- */
+
+typedef enum spa_import_type {
+       SPA_IMPORT_EXISTING,
+       SPA_IMPORT_ASSEMBLE
+} spa_import_type_t;
 
 /* state manipulation functions */
 extern int spa_open(const char *pool, spa_t **, void *tag);
+extern int spa_open_rewind(const char *pool, spa_t **, void *tag,
+    nvlist_t *policy, nvlist_t **config);
 extern int spa_get_stats(const char *pool, nvlist_t **config,
     char *altroot, size_t buflen);
 extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props,
@@ -338,6 +431,8 @@ extern void spa_async_suspend(spa_t *spa);
 extern void spa_async_resume(spa_t *spa);
 extern spa_t *spa_inject_addref(char *pool);
 extern void spa_inject_delref(spa_t *spa);
+extern void spa_scan_stat_init(spa_t *spa);
+extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
 
 #define        SPA_ASYNC_CONFIG_UPDATE 0x01
 #define        SPA_ASYNC_REMOVE        0x02
@@ -345,6 +440,14 @@ extern void spa_inject_delref(spa_t *spa);
 #define        SPA_ASYNC_RESILVER_DONE 0x08
 #define        SPA_ASYNC_RESILVER      0x10
 #define        SPA_ASYNC_AUTOEXPAND    0x20
+#define        SPA_ASYNC_REMOVE_DONE   0x40
+#define        SPA_ASYNC_REMOVE_STOP   0x80
+
+/*
+ * Controls the behavior of spa_vdev_remove().
+ */
+#define        SPA_REMOVE_UNSPARE      0x01
+#define        SPA_REMOVE_DONE         0x02
 
 /* device manipulation */
 extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
@@ -353,8 +456,11 @@ extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
 extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
     int replace_done);
 extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
+extern boolean_t spa_vdev_remove_active(spa_t *spa);
 extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
 extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
+extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
+    nvlist_t *props, boolean_t exp);
 
 /* spare state (which is global across all pools) */
 extern void spa_spare_add(vdev_t *vd);
@@ -368,15 +474,23 @@ extern void spa_l2cache_remove(vdev_t *vd);
 extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool);
 extern void spa_l2cache_activate(vdev_t *vd);
 extern void spa_l2cache_drop(spa_t *spa);
-extern void spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc);
 
-/* scrubbing */
-extern int spa_scrub(spa_t *spa, pool_scrub_type_t type);
+/* scanning */
+extern int spa_scan(spa_t *spa, pool_scan_func_t func);
+extern int spa_scan_stop(spa_t *spa);
 
 /* spa syncing */
 extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
 extern void spa_sync_allpools(void);
 
+/*
+ * DEFERRED_FREE must be large enough that regular blocks are not
+ * deferred.  XXX so can't we change it back to 1?
+ */
+#define        SYNC_PASS_DEFERRED_FREE 2       /* defer frees after this pass */
+#define        SYNC_PASS_DONT_COMPRESS 4       /* don't compress after this pass */
+#define        SYNC_PASS_REWRITE       1       /* rewrite new bps after this pass */
+
 /* spa namespace global mutex */
 extern kmutex_t spa_namespace_lock;
 
@@ -394,7 +508,6 @@ extern void spa_config_set(spa_t *spa, nvlist_t *config);
 extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
     int getstats);
 extern void spa_config_update(spa_t *spa, int what);
-extern void spa_config_update_common(spa_t *spa, int what, boolean_t isroot);
 
 /*
  * Miscellaneous SPA routines in spa_misc.c
@@ -402,7 +515,7 @@ extern void spa_config_update_common(spa_t *spa, int what, boolean_t isroot);
 
 /* Namespace manipulation */
 extern spa_t *spa_lookup(const char *name);
-extern spa_t *spa_add(const char *name, const char *altroot);
+extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot);
 extern void spa_remove(spa_t *spa);
 extern spa_t *spa_next(spa_t *prev);
 
@@ -411,6 +524,7 @@ extern void spa_open_ref(spa_t *spa, void *tag);
 extern void spa_close(spa_t *spa, void *tag);
 extern boolean_t spa_refcount_zero(spa_t *spa);
 
+#define        SCL_NONE        0x00
 #define        SCL_CONFIG      0x01
 #define        SCL_STATE       0x02
 #define        SCL_L2ARC       0x04            /* hack until L2ARC 2.0 */
@@ -430,12 +544,30 @@ extern int spa_config_held(spa_t *spa, int locks, krw_t rw);
 
 /* Pool vdev add/remove lock */
 extern uint64_t spa_vdev_enter(spa_t *spa);
+extern uint64_t spa_vdev_config_enter(spa_t *spa);
+extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg,
+    int error, char *tag);
 extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
 
 /* Pool vdev state change lock */
-extern void spa_vdev_state_enter(spa_t *spa);
+extern void spa_vdev_state_enter(spa_t *spa, int oplock);
 extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error);
 
+/* Log state */
+typedef enum spa_log_state {
+       SPA_LOG_UNKNOWN = 0,    /* unknown log state */
+       SPA_LOG_MISSING,        /* missing log(s) */
+       SPA_LOG_CLEAR,          /* clear the log(s) */
+       SPA_LOG_GOOD,           /* log(s) are good */
+} spa_log_state_t;
+
+extern spa_log_state_t spa_get_log_state(spa_t *spa);
+extern void spa_set_log_state(spa_t *spa, spa_log_state_t state);
+extern int spa_offline_log(spa_t *spa);
+
+/* Log claim callback */
+extern void spa_claim_notify(zio_t *zio);
+
 /* Accessor functions */
 extern boolean_t spa_shutting_down(spa_t *spa);
 extern struct dsl_pool *spa_get_dsl(spa_t *spa);
@@ -447,18 +579,26 @@ extern char *spa_name(spa_t *spa);
 extern uint64_t spa_guid(spa_t *spa);
 extern uint64_t spa_last_synced_txg(spa_t *spa);
 extern uint64_t spa_first_txg(spa_t *spa);
+extern uint64_t spa_syncing_txg(spa_t *spa);
 extern uint64_t spa_version(spa_t *spa);
 extern pool_state_t spa_state(spa_t *spa);
+extern spa_load_state_t spa_load_state(spa_t *spa);
 extern uint64_t spa_freeze_txg(spa_t *spa);
-extern uint64_t spa_get_alloc(spa_t *spa);
-extern uint64_t spa_get_space(spa_t *spa);
-extern uint64_t spa_get_dspace(spa_t *spa);
 extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
+extern uint64_t spa_get_dspace(spa_t *spa);
+extern void spa_update_dspace(spa_t *spa);
 extern uint64_t spa_version(spa_t *spa);
+extern boolean_t spa_deflate(spa_t *spa);
+extern metaslab_class_t *spa_normal_class(spa_t *spa);
+extern metaslab_class_t *spa_log_class(spa_t *spa);
 extern int spa_max_replication(spa_t *spa);
+extern int spa_prev_software_version(spa_t *spa);
 extern int spa_busy(void);
 extern uint8_t spa_get_failmode(spa_t *spa);
 extern boolean_t spa_suspended(spa_t *spa);
+extern uint64_t spa_bootfs(spa_t *spa);
+extern uint64_t spa_delegation(spa_t *spa);
+extern objset_t *spa_meta_objset(spa_t *spa);
 
 /* Miscellaneous support routines */
 extern int spa_rename(const char *oldname, const char *newname);
@@ -466,18 +606,24 @@ extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
 extern char *spa_strdup(const char *);
 extern void spa_strfree(char *);
 extern uint64_t spa_get_random(uint64_t range);
-extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp);
+extern uint64_t spa_generate_guid(spa_t *spa);
+extern void sprintf_blkptr(char *buf, const blkptr_t *bp);
 extern void spa_freeze(spa_t *spa);
 extern void spa_upgrade(spa_t *spa, uint64_t version);
 extern void spa_evict_all(void);
 extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,
     boolean_t l2cache);
 extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
-extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp);
+extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva);
+extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp);
+extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
 extern boolean_t spa_has_slogs(spa_t *spa);
 extern boolean_t spa_is_root(spa_t *spa);
 extern boolean_t spa_writeable(spa_t *spa);
+extern void spa_rewind_data_to_nvlist(spa_t *spa, nvlist_t *to);
+
 extern int spa_mode(spa_t *spa);
+extern uint64_t strtonum(const char *str, char **nptr);
 
 /* history logging */
 typedef enum history_log_type {
@@ -487,10 +633,11 @@ typedef enum history_log_type {
 } history_log_type_t;
 
 typedef struct history_arg {
-       const char *ha_history_str;
+       char *ha_history_str;
        history_log_type_t ha_log_type;
        history_internal_events_t ha_event;
-       char ha_zone[MAXPATHLEN];
+       char *ha_zone;
+       uid_t ha_uid;
 } history_arg_t;
 
 extern char *spa_his_ievent_table[];
@@ -500,17 +647,17 @@ extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read,
     char *his_buf);
 extern int spa_history_log(spa_t *spa, const char *his_buf,
     history_log_type_t what);
-extern void spa_history_internal_log(history_internal_events_t event,
-    spa_t *spa, dmu_tx_t *tx, cred_t *cr, const char *fmt, ...);
+extern void spa_history_log_internal(history_internal_events_t event,
+    spa_t *spa, dmu_tx_t *tx, const char *fmt, ...);
 extern void spa_history_log_version(spa_t *spa, history_internal_events_t evt);
 
 /* error handling */
 struct zbookmark;
-struct zio;
-extern void spa_log_error(spa_t *spa, struct zio *zio);
+extern void spa_log_error(spa_t *spa, zio_t *zio);
 extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
-    struct zio *zio, uint64_t stateoroffset, uint64_t length);
+    zio_t *zio, uint64_t stateoroffset, uint64_t length);
 extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
+extern void zfs_post_state_change(spa_t *spa, vdev_t *vd);
 extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
 extern uint64_t spa_get_errlog_size(spa_t *spa);
 extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count);
@@ -541,7 +688,7 @@ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name);
 #define        dprintf_bp(bp, fmt, ...) do {                           \
        if (zfs_flags & ZFS_DEBUG_DPRINTF) {                    \
        char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);  \
-       sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp));         \
+       sprintf_blkptr(__blkbuf, (bp));                         \
        dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf);            \
        kmem_free(__blkbuf, BP_SPRINTF_LEN);                    \
        } \
index 84da684..e2e1851 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_SPA_IMPL_H
@@ -36,6 +35,7 @@
 #include <sys/avl.h>
 #include <sys/refcount.h>
 #include <sys/bplist.h>
+#include <sys/bpobj.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -78,19 +78,33 @@ typedef struct spa_config_dirent {
        char            *scd_path;
 } spa_config_dirent_t;
 
-typedef enum spa_log_state {
-       SPA_LOG_UNKNOWN = 0,    /* unknown log state */
-       SPA_LOG_MISSING,        /* missing log(s) */
-       SPA_LOG_CLEAR,          /* clear the log(s) */
-       SPA_LOG_GOOD,           /* log(s) are good */
-} spa_log_state_t;
-
 enum zio_taskq_type {
        ZIO_TASKQ_ISSUE = 0,
+       ZIO_TASKQ_ISSUE_HIGH,
        ZIO_TASKQ_INTERRUPT,
+       ZIO_TASKQ_INTERRUPT_HIGH,
        ZIO_TASKQ_TYPES
 };
 
+/*
+ * State machine for the zpool-poolname process.  The state transitions
+ * are done as follows:
+ *
+ *     From               To                   Routine
+ *     PROC_NONE       -> PROC_CREATED         spa_activate()
+ *     PROC_CREATED    -> PROC_ACTIVE          spa_thread()
+ *     PROC_ACTIVE     -> PROC_DEACTIVATE      spa_deactivate()
+ *     PROC_DEACTIVATE -> PROC_GONE            spa_thread()
+ *     PROC_GONE       -> PROC_NONE            spa_deactivate()
+ */
+typedef enum spa_proc_state {
+       SPA_PROC_NONE,          /* spa_proc = &p0, no process created */
+       SPA_PROC_CREATED,       /* spa_activate() has proc, is waiting */
+       SPA_PROC_ACTIVE,        /* taskqs created, spa_proc set */
+       SPA_PROC_DEACTIVATE,    /* spa_deactivate() requests process exit */
+       SPA_PROC_GONE           /* spa_thread() is exiting, spa_proc = &p0 */
+} spa_proc_state_t;
+
 struct spa {
        /*
         * Fields protected by spa_namespace_lock.
@@ -99,6 +113,7 @@ struct spa {
        avl_node_t      spa_avl;                /* node in spa_namespace_avl */
        nvlist_t        *spa_config;            /* last synced config */
        nvlist_t        *spa_config_syncing;    /* currently syncing config */
+       nvlist_t        *spa_config_splitting;  /* config for splitting */
        uint64_t        spa_config_txg;         /* txg of last config change */
        int             spa_sync_pass;          /* iterate-to-convergence */
        pool_state_t    spa_state;              /* pool state */
@@ -113,6 +128,8 @@ struct spa {
        uint64_t        spa_first_txg;          /* first txg after spa_open() */
        uint64_t        spa_final_txg;          /* txg of export/destroy */
        uint64_t        spa_freeze_txg;         /* freeze pool at this txg */
+       uint64_t        spa_load_max_txg;       /* best initial ub_txg */
+       uint64_t        spa_claim_max_txg;      /* highest claimed birth txg */
        objset_t        *spa_meta_objset;       /* copy of dp->dp_meta_objset */
        txg_list_t      spa_vdev_txg_list;      /* per-txg dirty vdev list */
        vdev_t          *spa_root_vdev;         /* top-level vdev container */
@@ -122,21 +139,24 @@ struct spa {
        spa_aux_vdev_t  spa_spares;             /* hot spares */
        spa_aux_vdev_t  spa_l2cache;            /* L2ARC cache devices */
        uint64_t        spa_config_object;      /* MOS object for pool config */
+       uint64_t        spa_config_generation;  /* config generation number */
        uint64_t        spa_syncing_txg;        /* txg currently syncing */
-       uint64_t        spa_sync_bplist_obj;    /* object for deferred frees */
-       bplist_t        spa_sync_bplist;        /* deferred-free bplist */
+       bpobj_t         spa_deferred_bpobj;     /* deferred-free bplist */
+       bplist_t        spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */
        uberblock_t     spa_ubsync;             /* last synced uberblock */
        uberblock_t     spa_uberblock;          /* current uberblock */
+       boolean_t       spa_extreme_rewind;     /* rewind past deferred frees */
        kmutex_t        spa_scrub_lock;         /* resilver/scrub lock */
        uint64_t        spa_scrub_inflight;     /* in-flight scrub I/Os */
        uint64_t        spa_scrub_maxinflight;  /* max in-flight scrub I/Os */
-       uint64_t        spa_scrub_errors;       /* scrub I/O error count */
        kcondvar_t      spa_scrub_io_cv;        /* scrub I/O completion */
        uint8_t         spa_scrub_active;       /* active or suspended? */
        uint8_t         spa_scrub_type;         /* type of scrub we're doing */
        uint8_t         spa_scrub_finished;     /* indicator to rotate logs */
        uint8_t         spa_scrub_started;      /* started since last boot */
        uint8_t         spa_scrub_reopen;       /* scrub doing vdev_reopen */
+       uint64_t        spa_scan_pass_start;    /* start time per pass/reboot */
+       uint64_t        spa_scan_pass_exam;     /* examined bytes per pass */
        kmutex_t        spa_async_lock;         /* protect async state */
        kthread_t       *spa_async_thread;      /* thread doing async task */
        int             spa_async_suspended;    /* async tasks suspended */
@@ -144,7 +164,14 @@ struct spa {
        uint16_t        spa_async_tasks;        /* async task mask */
        char            *spa_root;              /* alternate root directory */
        uint64_t        spa_ena;                /* spa-wide ereport ENA */
-       boolean_t       spa_last_open_failed;   /* true if last open faled */
+       int             spa_last_open_failed;   /* error if last open failed */
+       uint64_t        spa_last_ubsync_txg;    /* "best" uberblock txg */
+       uint64_t        spa_last_ubsync_txg_ts; /* timestamp from that ub */
+       uint64_t        spa_load_txg;           /* ub txg that loaded */
+       uint64_t        spa_load_txg_ts;        /* timestamp from that ub */
+       uint64_t        spa_load_meta_errors;   /* verify metadata err count */
+       uint64_t        spa_load_data_errors;   /* verify data err count */
+       uint64_t        spa_verify_min_txg;     /* start txg of verify scrub */
        kmutex_t        spa_errlog_lock;        /* error log lock */
        uint64_t        spa_errlog_last;        /* last error log object */
        uint64_t        spa_errlog_scrub;       /* scrub error log object */
@@ -166,11 +193,27 @@ struct spa {
        kmutex_t        spa_suspend_lock;       /* protects suspend_zio_root */
        kcondvar_t      spa_suspend_cv;         /* notification of resume */
        uint8_t         spa_suspended;          /* pool is suspended */
+       uint8_t         spa_claiming;           /* pool is doing zil_claim() */
        boolean_t       spa_is_root;            /* pool is root */
        int             spa_minref;             /* num refs when first opened */
        int             spa_mode;               /* FREAD | FWRITE */
        spa_log_state_t spa_log_state;          /* log state */
        uint64_t        spa_autoexpand;         /* lun expansion on/off */
+       ddt_t           *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */
+       uint64_t        spa_ddt_stat_object;    /* DDT statistics */
+       uint64_t        spa_dedup_ditto;        /* dedup ditto threshold */
+       uint64_t        spa_dedup_checksum;     /* default dedup checksum */
+       uint64_t        spa_dspace;             /* dspace in normal class */
+       kmutex_t        spa_vdev_top_lock;      /* dueling offline/remove */
+       kmutex_t        spa_proc_lock;          /* protects spa_proc* */
+       kcondvar_t      spa_proc_cv;            /* spa_proc_state transitions */
+       spa_proc_state_t spa_proc_state;        /* see definition */
+       struct proc     *spa_proc;              /* "zpool-poolname" process */
+       uint64_t        spa_did;                /* if procp != p0, did of t1 */
+       boolean_t       spa_autoreplace;        /* autoreplace set in open */
+       int             spa_vdev_locks;         /* locks grabbed */
+       uint64_t        spa_creation_version;   /* version at pool creation */
+       uint64_t        spa_prev_software_version;
        /*
         * spa_refcnt & spa_config_lock must be the last elements
         * because refcount_t changes size based on compilation options.
@@ -183,12 +226,6 @@ struct spa {
 
 extern const char *spa_config_path;
 
-#define        BOOTFS_COMPRESS_VALID(compress) \
-       ((compress) == ZIO_COMPRESS_LZJB || \
-       ((compress) == ZIO_COMPRESS_ON && \
-       ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \
-       (compress) == ZIO_COMPRESS_OFF)
-
 #ifdef __cplusplus
 }
 #endif
index a682bbd..6f935c9 100644 (file)
@@ -77,6 +77,7 @@ struct space_map_ops {
        void    (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size);
        void    (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size);
        uint64_t (*smop_max)(space_map_t *sm);
+       boolean_t (*smop_fragmented)(space_map_t *sm);
 };
 
 /*
index 23bdff2..e323d5e 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _SYS_TXG_H
 #define        _SYS_TXG_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/spa.h>
 #include <sys/zfs_context.h>
 
@@ -41,6 +39,9 @@ extern "C" {
 #define        TXG_INITIAL             TXG_SIZE        /* initial txg          */
 #define        TXG_IDX                 (txg & TXG_MASK)
 
+/* Number of txgs worth of frees we defer adding to in-core spacemaps */
+#define        TXG_DEFER_SIZE          2
+
 #define        TXG_WAIT                1ULL
 #define        TXG_NOWAIT              2ULL
 
@@ -71,8 +72,7 @@ extern void txg_sync_stop(struct dsl_pool *dp);
 extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp);
 extern void txg_rele_to_quiesce(txg_handle_t *txghp);
 extern void txg_rele_to_sync(txg_handle_t *txghp);
-extern void txg_suspend(struct dsl_pool *dp);
-extern void txg_resume(struct dsl_pool *dp);
+extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks);
 
 /*
  * Delay the caller by the specified number of ticks or until
@@ -117,6 +117,7 @@ extern void txg_list_create(txg_list_t *tl, size_t offset);
 extern void txg_list_destroy(txg_list_t *tl);
 extern int txg_list_empty(txg_list_t *tl, uint64_t txg);
 extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg);
+extern int txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg);
 extern void *txg_list_remove(txg_list_t *tl, uint64_t txg);
 extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg);
 extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg);
index 7413c66..7b356ea 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -37,13 +37,13 @@ struct tx_cpu {
        kmutex_t        tc_lock;
        kcondvar_t      tc_cv[TXG_SIZE];
        uint64_t        tc_count[TXG_SIZE];
+       list_t          tc_callbacks[TXG_SIZE]; /* commit cb list */
        char            tc_pad[16];
 };
 
 typedef struct tx_state {
        tx_cpu_t        *tx_cpu;        /* protects right to enter txg  */
        kmutex_t        tx_sync_lock;   /* protects tx_state_t */
-       krwlock_t       tx_suspend;
        uint64_t        tx_open_txg;    /* currently open txg id */
        uint64_t        tx_quiesced_txg; /* quiesced txg waiting for sync */
        uint64_t        tx_syncing_txg; /* currently syncing txg id */
@@ -64,6 +64,8 @@ typedef struct tx_state {
 
        kthread_t       *tx_sync_thread;
        kthread_t       *tx_quiesce_thread;
+
+       taskq_t         *tx_commit_cb_taskq; /* commit callback taskq */
 } tx_state_t;
 
 #ifdef __cplusplus
index 93d936a..b5bb915 100644 (file)
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _SYS_UBERBLOCK_H
 #define        _SYS_UBERBLOCK_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/spa.h>
 #include <sys/vdev.h>
 #include <sys/zio.h>
-#include <sys/zio_checksum.h>
 
 #ifdef __cplusplus
 extern "C" {
index b49df8a..6ab6aa3 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_UBERBLOCK_IMPL_H
@@ -33,11 +32,6 @@ extern "C" {
 #endif
 
 /*
- * For zdb use and debugging purposes only
- */
-extern uint64_t ub_max_txg;
-
-/*
  * The uberblock version is incremented whenever an incompatible on-disk
  * format change is made to the SPA, DMU, or ZAP.
  *
@@ -57,6 +51,9 @@ struct uberblock {
        uint64_t        ub_guid_sum;    /* sum of all vdev guids        */
        uint64_t        ub_timestamp;   /* UTC time of last sync        */
        blkptr_t        ub_rootbp;      /* MOS objset_phys_t            */
+
+       /* highest SPA_VERSION supported by software that wrote this txg */
+       uint64_t        ub_software_version;
 };
 
 #ifdef __cplusplus
index 7e53f62..941f234 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_VDEV_H
@@ -47,7 +46,8 @@ typedef enum vdev_dtl_type {
 extern boolean_t zfs_nocacheflush;
 
 extern int vdev_open(vdev_t *);
-extern void vdev_open_children(vdev_t *vd);
+extern void vdev_open_children(vdev_t *);
+extern boolean_t vdev_uses_zvols(vdev_t *);
 extern int vdev_validate(vdev_t *);
 extern void vdev_close(vdev_t *);
 extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
@@ -69,28 +69,31 @@ extern boolean_t vdev_dtl_required(vdev_t *vd);
 extern boolean_t vdev_resilver_needed(vdev_t *vd,
     uint64_t *minp, uint64_t *maxp);
 
+extern void vdev_hold(vdev_t *);
+extern void vdev_rele(vdev_t *);
+
 extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
 extern void vdev_metaslab_fini(vdev_t *vd);
 extern void vdev_metaslab_set_size(vdev_t *);
 extern void vdev_expand(vdev_t *vd, uint64_t txg);
+extern void vdev_split(vdev_t *vd);
+
 
 extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
 extern void vdev_clear_stats(vdev_t *vd);
 extern void vdev_stat_update(zio_t *zio, uint64_t psize);
-extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type,
-    boolean_t complete);
-extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec);
+extern void vdev_scan_stat_init(vdev_t *vd);
 extern void vdev_propagate_state(vdev_t *vd);
 extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
     vdev_aux_t aux);
 
-extern void vdev_space_update(vdev_t *vd, int64_t space_delta,
-    int64_t alloc_delta, boolean_t update_root);
+extern void vdev_space_update(vdev_t *vd,
+    int64_t alloc_delta, int64_t defer_delta, int64_t space_delta);
 
 extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
 
-extern int vdev_fault(spa_t *spa, uint64_t guid);
-extern int vdev_degrade(spa_t *spa, uint64_t guid);
+extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux);
+extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux);
 extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
     vdev_state_t *);
 extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags);
@@ -121,8 +124,15 @@ extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg,
 extern void vdev_state_dirty(vdev_t *vd);
 extern void vdev_state_clean(vdev_t *vd);
 
+typedef enum vdev_config_flag {
+       VDEV_CONFIG_SPARE = 1 << 0,
+       VDEV_CONFIG_L2CACHE = 1 << 1,
+       VDEV_CONFIG_REMOVING = 1 << 2
+} vdev_config_flag_t;
+
+extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
 extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
-    boolean_t getstats, boolean_t isspare, boolean_t isl2cache);
+    boolean_t getstats, vdev_config_flag_t flags);
 
 /*
  * Label routines
@@ -138,7 +148,8 @@ typedef enum {
        VDEV_LABEL_REPLACE,     /* replace an existing device */
        VDEV_LABEL_SPARE,       /* add a new hot spare */
        VDEV_LABEL_REMOVE,      /* remove an existing device */
-       VDEV_LABEL_L2CACHE      /* add an L2ARC cache device */
+       VDEV_LABEL_L2CACHE,     /* add an L2ARC cache device */
+       VDEV_LABEL_SPLIT        /* generating new label for split-off dev */
 } vdev_labeltype_t;
 
 extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);
index 2378043..2b886bc 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_VDEV_IMPL_H
@@ -62,6 +61,8 @@ typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
 typedef int    vdev_io_start_func_t(zio_t *zio);
 typedef void   vdev_io_done_func_t(zio_t *zio);
 typedef void   vdev_state_change_func_t(vdev_t *vd, int, int);
+typedef void   vdev_hold_func_t(vdev_t *vd);
+typedef void   vdev_rele_func_t(vdev_t *vd);
 
 typedef struct vdev_ops {
        vdev_open_func_t                *vdev_op_open;
@@ -70,6 +71,8 @@ typedef struct vdev_ops {
        vdev_io_start_func_t            *vdev_op_io_start;
        vdev_io_done_func_t             *vdev_op_io_done;
        vdev_state_change_func_t        *vdev_op_state_change;
+       vdev_hold_func_t                *vdev_op_hold;
+       vdev_rele_func_t                *vdev_op_rele;
        char                            vdev_op_type[16];
        boolean_t                       vdev_op_leaf;
 } vdev_ops_t;
@@ -112,6 +115,7 @@ struct vdev {
        uint64_t        vdev_id;        /* child number in vdev parent  */
        uint64_t        vdev_guid;      /* unique ID for this vdev      */
        uint64_t        vdev_guid_sum;  /* self guid + all child guids  */
+       uint64_t        vdev_orig_guid; /* orig. guid prior to remove   */
        uint64_t        vdev_asize;     /* allocatable device capacity  */
        uint64_t        vdev_min_asize; /* min acceptable asize         */
        uint64_t        vdev_ashift;    /* block alignment shift        */
@@ -120,6 +124,8 @@ struct vdev {
        vdev_ops_t      *vdev_ops;      /* vdev operations              */
        spa_t           *vdev_spa;      /* spa for this vdev            */
        void            *vdev_tsd;      /* type-specific data           */
+       vnode_t         *vdev_name_vp;  /* vnode for pathname           */
+       vnode_t         *vdev_devid_vp; /* vnode for devid              */
        vdev_t          *vdev_top;      /* top-level vdev               */
        vdev_t          *vdev_parent;   /* parent vdev                  */
        vdev_t          **vdev_child;   /* array of children            */
@@ -127,8 +133,10 @@ struct vdev {
        space_map_t     vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */
        vdev_stat_t     vdev_stat;      /* virtual device statistics    */
        boolean_t       vdev_expanding; /* expand the vdev?             */
+       boolean_t       vdev_reopening; /* reopen in progress?          */
        int             vdev_open_error; /* error on last open          */
        kthread_t       *vdev_open_thread; /* thread opening children   */
+       uint64_t        vdev_crtxg;     /* txg when top-level was added */
 
        /*
         * Top-level vdev state.
@@ -143,10 +151,12 @@ struct vdev {
        txg_node_t      vdev_txg_node;  /* per-txg dirty vdev linkage   */
        boolean_t       vdev_remove_wanted; /* async remove wanted?     */
        boolean_t       vdev_probe_wanted; /* async probe wanted?       */
+       uint64_t        vdev_removing;  /* device is being removed?     */
        list_node_t     vdev_config_dirty_node; /* config dirty list    */
        list_node_t     vdev_state_dirty_node; /* state dirty list      */
        uint64_t        vdev_deflate_ratio; /* deflation ratio (x512)   */
        uint64_t        vdev_islog;     /* is an intent log device      */
+       uint64_t        vdev_ishole;    /* is a hole in the namespace   */
 
        /*
         * Leaf vdev state.
@@ -170,6 +180,8 @@ struct vdev {
        boolean_t       vdev_nowritecache; /* true if flushwritecache failed */
        boolean_t       vdev_checkremove; /* temporary online test      */
        boolean_t       vdev_forcefault; /* force online fault          */
+       boolean_t       vdev_splitting; /* split or repair in progress  */
+       boolean_t       vdev_delayed_close; /* delayed device close?    */
        uint8_t         vdev_tmpoffline; /* device taken offline temporarily? */
        uint8_t         vdev_detached;  /* device detached?             */
        uint8_t         vdev_cant_read; /* vdev is failing all reads    */
@@ -180,6 +192,7 @@ struct vdev {
        vdev_cache_t    vdev_cache;     /* physical block cache         */
        spa_aux_vdev_t  *vdev_aux;      /* for l2cache vdevs            */
        zio_t           *vdev_probe_zio; /* root of current probe       */
+       vdev_aux_t      vdev_label_aux; /* on-disk aux state            */
 
        /*
         * For DTrace to work in userland (libzpool) context, these fields must
@@ -193,6 +206,8 @@ struct vdev {
        kmutex_t        vdev_probe_lock; /* protects vdev_probe_zio     */
 };
 
+#define        VDEV_RAIDZ_MAXPARITY    3
+
 #define        VDEV_PAD_SIZE           (8 << 10)
 /* 2 padding areas (vl_pad1 and vl_pad2) to skip */
 #define        VDEV_SKIP_SIZE          VDEV_PAD_SIZE * 2
@@ -208,8 +223,8 @@ struct vdev {
 #define        VDEV_UBERBLOCK_SIZE(vd)         (1ULL << VDEV_UBERBLOCK_SHIFT(vd))
 
 typedef struct vdev_phys {
-       char            vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)];
-       zio_block_tail_t vp_zbt;
+       char            vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)];
+       zio_eck_t       vp_zbt;
 } vdev_phys_t;
 
 typedef struct vdev_label {
@@ -244,10 +259,13 @@ typedef struct vdev_label {
 #define        VDEV_ALLOC_SPARE        2
 #define        VDEV_ALLOC_L2CACHE      3
 #define        VDEV_ALLOC_ROOTPOOL     4
+#define        VDEV_ALLOC_SPLIT        5
 
 /*
  * Allocate or free a vdev
  */
+extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid,
+    vdev_ops_t *ops);
 extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config,
     vdev_t *parent, uint_t id, int alloctype);
 extern void vdev_free(vdev_t *vd);
@@ -264,7 +282,7 @@ extern void vdev_remove_parent(vdev_t *cvd);
 /*
  * vdev sync load and sync
  */
-extern void vdev_load_log_state(vdev_t *vd, nvlist_t *nv);
+extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd);
 extern void vdev_load(vdev_t *vd);
 extern void vdev_sync(vdev_t *vd, uint64_t txg);
 extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
@@ -280,6 +298,7 @@ extern vdev_ops_t vdev_raidz_ops;
 extern vdev_ops_t vdev_disk_ops;
 extern vdev_ops_t vdev_file_ops;
 extern vdev_ops_t vdev_missing_ops;
+extern vdev_ops_t vdev_hole_ops;
 extern vdev_ops_t vdev_spare_ops;
 
 /*
index 967174b..a1130bb 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_ZAP_H
@@ -101,6 +100,18 @@ typedef enum matchtype
        MT_FIRST
 } matchtype_t;
 
+typedef enum zap_flags {
+       /* Use 64-bit hash value (serialized cursors will always use 64-bits) */
+       ZAP_FLAG_HASH64 = 1 << 0,
+       /* Key is binary, not string (zap_add_uint64() can be used) */
+       ZAP_FLAG_UINT64_KEY = 1 << 1,
+       /*
+        * First word of key (which must be an array of uint64) is
+        * already randomly distributed.
+        */
+       ZAP_FLAG_PRE_HASHED_KEY = 1 << 2,
+} zap_flags_t;
+
 /*
  * Create a new zapobj with no attributes and return its object number.
  * MT_EXACT will cause the zap object to only support MT_EXACT lookups,
@@ -118,6 +129,9 @@ uint64_t zap_create(objset_t *ds, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
 uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
+    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
 
 /*
  * Create a new zapobj with no attributes from the given (unallocated)
@@ -180,6 +194,11 @@ int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name,
     uint64_t integer_size, uint64_t num_integers, void *buf,
     matchtype_t mt, char *realname, int rn_len,
     boolean_t *normalization_conflictp);
+int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf);
+int zap_contains(objset_t *ds, uint64_t zapobj, const char *name);
+int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints);
 
 int zap_count_write(objset_t *os, uint64_t zapobj, const char *name,
     int add, uint64_t *towrite, uint64_t *tooverwrite);
@@ -190,9 +209,12 @@ int zap_count_write(objset_t *os, uint64_t zapobj, const char *name,
  * If an attribute with the given name already exists, the call will
  * fail and return EEXIST.
  */
-int zap_add(objset_t *ds, uint64_t zapobj, const char *name,
+int zap_add(objset_t *ds, uint64_t zapobj, const char *key,
     int integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx);
+int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key,
+    int key_numints, int integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx);
 
 /*
  * Set the attribute with the given name to the given value.  If an
@@ -204,6 +226,9 @@ int zap_add(objset_t *ds, uint64_t zapobj, const char *name,
  */
 int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints,
+    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
 
 /*
  * Get the length (in integers) and the integer size of the specified
@@ -214,6 +239,8 @@ int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
  */
 int zap_length(objset_t *ds, uint64_t zapobj, const char *name,
     uint64_t *integer_size, uint64_t *num_integers);
+int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints, uint64_t *integer_size, uint64_t *num_integers);
 
 /*
  * Remove the specified attribute.
@@ -224,6 +251,8 @@ int zap_length(objset_t *ds, uint64_t zapobj, const char *name,
 int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx);
 int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name,
     matchtype_t mt, dmu_tx_t *tx);
+int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints, dmu_tx_t *tx);
 
 /*
  * Returns (in *count) the number of attributes in the specified zap
@@ -231,7 +260,6 @@ int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name,
  */
 int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count);
 
-
 /*
  * Returns (in name) the name of the entry whose (value & mask)
  * (za_first_integer) is value, or ENOENT if not found.  The string
@@ -248,6 +276,14 @@ int zap_value_search(objset_t *os, uint64_t zapobj,
  */
 int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx);
 
+/* Same as zap_join, but set the values to 'value'. */
+int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+    uint64_t value, dmu_tx_t *tx);
+
+/* Same as zap_join, but add together any duplicated entries. */
+int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+    dmu_tx_t *tx);
+
 /*
  * Manipulate entries where the name + value are the "same" (the name is
  * a stringified version of the value).
@@ -255,6 +291,23 @@ int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx);
 int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
 int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
 int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value);
+int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
+    dmu_tx_t *tx);
+
+/* Here the key is an int and the value is a different int. */
+int zap_add_int_key(objset_t *os, uint64_t obj,
+    uint64_t key, uint64_t value, dmu_tx_t *tx);
+int zap_lookup_int_key(objset_t *os, uint64_t obj,
+    uint64_t key, uint64_t *valuep);
+
+/*
+ * The name is a stringified version of key; increment its value by
+ * delta.  Zero values will be zap_remove()-ed.
+ */
+int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
+    dmu_tx_t *tx);
+int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
+    dmu_tx_t *tx);
 
 struct zap;
 struct zap_leaf;
@@ -264,6 +317,7 @@ typedef struct zap_cursor {
        struct zap *zc_zap;
        struct zap_leaf *zc_leaf;
        uint64_t zc_zapobj;
+       uint64_t zc_serialized;
        uint64_t zc_hash;
        uint32_t zc_cd;
 } zap_cursor_t;
@@ -315,6 +369,11 @@ void zap_cursor_advance(zap_cursor_t *zc);
 uint64_t zap_cursor_serialize(zap_cursor_t *zc);
 
 /*
+ * Advance the cursor to the attribute having the given key.
+ */
+int zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt);
+
+/*
  * Initialize a zap cursor pointing to the position recorded by
  * zap_cursor_serialize (in the "serialized" argument).  You can also
  * use a "serialized" argument of 0 to start at the beginning of the
index c86bb16..1dc322e 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_ZAP_IMPL_H
@@ -40,13 +39,13 @@ extern int fzap_default_block_shift;
 
 #define        FZAP_BLOCK_SHIFT(zap)   ((zap)->zap_f.zap_block_shift)
 
-#define        ZAP_MAXCD               (uint32_t)(-1)
-#define        ZAP_HASHBITS            28
 #define        MZAP_ENT_LEN            64
 #define        MZAP_NAME_LEN           (MZAP_ENT_LEN - 8 - 4 - 2)
 #define        MZAP_MAX_BLKSHIFT       SPA_MAXBLOCKSHIFT
 #define        MZAP_MAX_BLKSZ          (1 << MZAP_MAX_BLKSHIFT)
 
+#define        ZAP_NEED_CD             (-1U)
+
 typedef struct mzap_ent_phys {
        uint64_t mze_value;
        uint32_t mze_cd;
@@ -67,9 +66,11 @@ typedef struct mzap_ent {
        avl_node_t mze_node;
        int mze_chunkid;
        uint64_t mze_hash;
-       mzap_ent_phys_t mze_phys;
+       uint32_t mze_cd; /* copy from mze_phys->mze_cd */
 } mzap_ent_t;
 
+#define        MZE_PHYS(zap, mze) \
+       (&(zap)->zap_m.zap_phys->mz_chunk[(mze)->mze_chunkid])
 
 /*
  * The (fat) zap is stored in one object. It is an array of
@@ -127,6 +128,7 @@ typedef struct zap_phys {
        uint64_t zap_num_entries;       /* number of entries */
        uint64_t zap_salt;              /* salt to stir into hash function */
        uint64_t zap_normflags;         /* flags for u8_textprep_str() */
+       uint64_t zap_flags;             /* zap_flags_t */
        /*
         * This structure is followed by padding, and then the embedded
         * pointer table.  The embedded pointer table takes up second
@@ -168,10 +170,13 @@ typedef struct zap {
 
 typedef struct zap_name {
        zap_t *zn_zap;
-       const char *zn_name_orij;
+       int zn_key_intlen;
+       const void *zn_key_orig;
+       int zn_key_orig_numints;
+       const void *zn_key_norm;
+       int zn_key_norm_numints;
        uint64_t zn_hash;
        matchtype_t zn_matchtype;
-       const char *zn_name_norm;
        char zn_normbuf[ZAP_MAXNAMELEN];
 } zap_name_t;
 
@@ -183,8 +188,11 @@ int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp);
 void zap_unlockdir(zap_t *zap);
 void zap_evict(dmu_buf_t *db, void *vmzap);
-zap_name_t *zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt);
+zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt);
 void zap_name_free(zap_name_t *zn);
+int zap_hashbits(zap_t *zap);
+uint32_t zap_maxcd(zap_t *zap);
+uint64_t zap_getflags(zap_t *zap);
 
 #define        ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
 
@@ -193,6 +201,7 @@ int fzap_count(zap_t *zap, uint64_t *count);
 int fzap_lookup(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers, void *buf,
     char *realname, int rn_len, boolean_t *normalization_conflictp);
+void fzap_prefetch(zap_name_t *zn);
 int fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
     uint64_t *tooverwrite);
 int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
@@ -209,7 +218,8 @@ void zap_put_leaf(struct zap_leaf *l);
 int fzap_add_cd(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers,
     const void *val, uint32_t cd, dmu_tx_t *tx);
-void fzap_upgrade(zap_t *zap, dmu_tx_t *tx);
+void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags);
+int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn);
 
 #ifdef __cplusplus
 }
index 14144e0..3a33636 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_ZAP_LEAF_H
 #define        _SYS_ZAP_LEAF_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
+#include <sys/zap.h>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 struct zap;
+struct zap_name;
+struct zap_stats;
 
 #define        ZAP_LEAF_MAGIC 0x2AB1EAF
 
@@ -129,12 +130,12 @@ typedef struct zap_leaf_phys {
 typedef union zap_leaf_chunk {
        struct zap_leaf_entry {
                uint8_t le_type;                /* always ZAP_CHUNK_ENTRY */
-               uint8_t le_int_size;            /* size of ints */
+               uint8_t le_value_intlen;        /* size of value's ints */
                uint16_t le_next;               /* next entry in hash chain */
                uint16_t le_name_chunk;         /* first chunk of the name */
-               uint16_t le_name_length;        /* bytes in name, incl null */
+               uint16_t le_name_numints;       /* ints in name (incl null) */
                uint16_t le_value_chunk;        /* first chunk of the value */
-               uint16_t le_value_length;       /* value length in ints */
+               uint16_t le_value_numints;      /* value length in ints */
                uint32_t le_cd;                 /* collision differentiator */
                uint64_t le_hash;               /* hash value of the name */
        } l_entry;
@@ -177,7 +178,7 @@ typedef struct zap_entry_handle {
  * value must equal zap_hash(name).
  */
 extern int zap_leaf_lookup(zap_leaf_t *l,
-    zap_name_t *zn, zap_entry_handle_t *zeh);
+    struct zap_name *zn, zap_entry_handle_t *zeh);
 
 /*
  * Return a handle to the entry with this hash+cd, or the entry with the
@@ -193,10 +194,10 @@ extern int zap_leaf_lookup_closest(zap_leaf_t *l,
  * num_integers in the attribute.
  */
 extern int zap_entry_read(const zap_entry_handle_t *zeh,
-       uint8_t integer_size, uint64_t num_integers, void *buf);
+    uint8_t integer_size, uint64_t num_integers, void *buf);
 
-extern int zap_entry_read_name(const zap_entry_handle_t *zeh,
-       uint16_t buflen, char *buf);
+extern int zap_entry_read_name(struct zap *zap, const zap_entry_handle_t *zeh,
+    uint16_t buflen, char *buf);
 
 /*
  * Replace the value of an existing entry.
@@ -204,7 +205,7 @@ extern int zap_entry_read_name(const zap_entry_handle_t *zeh,
  * zap_entry_update may fail if it runs out of space (ENOSPC).
  */
 extern int zap_entry_update(zap_entry_handle_t *zeh,
-       uint8_t integer_size, uint64_t num_integers, const void *buf);
+    uint8_t integer_size, uint64_t num_integers, const void *buf);
 
 /*
  * Remove an entry.
@@ -216,17 +217,16 @@ extern void zap_entry_remove(zap_entry_handle_t *zeh);
  * belong in this leaf (according to its hash value).  Fills in the
  * entry handle on success.  Returns 0 on success or ENOSPC on failure.
  */
-extern int zap_entry_create(zap_leaf_t *l,
-       const char *name, uint64_t h, uint32_t cd,
-       uint8_t integer_size, uint64_t num_integers, const void *buf,
-       zap_entry_handle_t *zeh);
+extern int zap_entry_create(zap_leaf_t *l, struct zap_name *zn, uint32_t cd,
+    uint8_t integer_size, uint64_t num_integers, const void *buf,
+    zap_entry_handle_t *zeh);
 
 /*
  * Return true if there are additional entries with the same normalized
  * form.
  */
 extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh,
-    zap_name_t *zn, const char *name, zap_t *zap);
+    struct zap_name *zn, const char *name, struct zap *zap);
 
 /*
  * Other stuff.
@@ -235,7 +235,8 @@ extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh,
 extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort);
 extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len);
 extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort);
-extern void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs);
+extern void zap_leaf_stats(struct zap *zap, zap_leaf_t *l,
+    struct zap_stats *zs);
 
 #ifdef __cplusplus
 }
index 3488962..72e868f 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_FS_ZFS_ACL_H
@@ -33,6 +32,7 @@
 #include <sys/acl.h>
 #include <sys/dmu.h>
 #include <sys/zfs_fuid.h>
+#include <sys/sa.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -106,12 +106,18 @@ typedef struct zfs_acl_phys_v0 {
 
 #define        ZFS_ACE_SPACE   (sizeof (zfs_oldace_t) * ACE_SLOT_CNT)
 
+/*
+ * Size of ACL count is always 2 bytes.
+ * Necessary for dealing with both V0 ACL and V1 ACL layout
+ */
+#define        ZFS_ACL_COUNT_SIZE      (sizeof (uint16_t))
+
 typedef struct zfs_acl_phys {
        uint64_t        z_acl_extern_obj;         /* ext acl pieces */
        uint32_t        z_acl_size;               /* Number of bytes in ACL */
        uint16_t        z_acl_version;            /* acl version */
        uint16_t        z_acl_count;              /* ace count */
-       uint8_t         z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */
+       uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */
 } zfs_acl_phys_t;
 
 typedef struct acl_ops {
@@ -146,21 +152,26 @@ typedef struct zfs_acl_node {
        void            *z_allocdata;   /* pointer to kmem allocated memory */
        size_t          z_allocsize;    /* Size of blob in bytes */
        size_t          z_size;         /* length of ACL data */
-       int             z_ace_count;    /* number of ACEs in this acl node */
+       uint64_t        z_ace_count;    /* number of ACEs in this acl node */
        int             z_ace_idx;      /* ace iterator positioned on */
 } zfs_acl_node_t;
 
 typedef struct zfs_acl {
-       int             z_acl_count;    /* Number of ACEs */
+       uint64_t        z_acl_count;    /* Number of ACEs */
        size_t          z_acl_bytes;    /* Number of bytes in ACL */
        uint_t          z_version;      /* version of ACL */
        void            *z_next_ace;    /* pointer to next ACE */
-       int             z_hints;        /* ACL hints (ZFS_INHERIT_ACE ...) */
+       uint64_t        z_hints;        /* ACL hints (ZFS_INHERIT_ACE ...) */
        zfs_acl_node_t  *z_curr_node;   /* current node iterator is handling */
        list_t          z_acl;          /* chunks of ACE data */
        acl_ops_t       z_ops;          /* ACL operations */
 } zfs_acl_t;
 
+typedef struct acl_locator_cb {
+       zfs_acl_t *cb_aclp;
+       zfs_acl_node_t *cb_acl_node;
+} zfs_acl_locator_cb_t;
+
 #define        ACL_DATA_ALLOCED        0x1
 #define        ZFS_ACL_SIZE(aclcnt)    (sizeof (ace_t) * (aclcnt))
 
@@ -174,6 +185,10 @@ typedef struct zfs_acl_ids {
        struct zfs_fuid_info    *z_fuidp;       /* for tracking fuids for log */
 } zfs_acl_ids_t;
 
+#define        ZFS_EXTERNAL_ACL(zp) \
+       (zp->z_is_sa ? 0 : zfs_external_acl(zp))
+#define        ZNODE_ACL_VERSION(zp) \
+       (zp->z_is_sa ? ZFS_ACL_VERSION_FUID : zfs_znode_acl_version(zp))
 /*
  * Property values for acl_mode and acl_inherit.
  *
@@ -215,6 +230,16 @@ void zfs_acl_free(zfs_acl_t *);
 int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, cred_t *,
     struct zfs_fuid_info **, zfs_acl_t **);
 int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, dmu_tx_t *);
+uint64_t zfs_external_acl(struct znode *);
+int zfs_znode_acl_version(struct znode *);
+int zfs_acl_size(struct znode *, int *);
+zfs_acl_t *zfs_acl_alloc(int);
+zfs_acl_node_t *zfs_acl_node_alloc(size_t);
+void zfs_acl_xform(struct znode *, zfs_acl_t *, cred_t *);
+void zfs_acl_data_locator(void **, uint32_t *, uint32_t, boolean_t, void *);
+uint64_t zfs_mode_compute(uint64_t, zfs_acl_t *,
+    uint64_t *, uint64_t, uint64_t);
+int zfs_acl_chown_setattr(struct znode *);
 
 #endif
 
index 40de320..558e9e1 100644 (file)
@@ -62,6 +62,7 @@ extern "C" {
 #include <sys/sysevent/eventdefs.h>
 #include <sys/sysevent/dev.h>
 #include <sys/fm/util.h>
+#include <sys/sunddi.h>
 
 #define        CPU_SEQID       (CPU->cpu_seqid)
 
index c15c946..f88ef95 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _ZFS_CTLDIR_H
@@ -49,6 +48,7 @@ void zfsctl_destroy(zfsvfs_t *);
 vnode_t *zfsctl_root(znode_t *);
 void zfsctl_init(void);
 void zfsctl_fini(void);
+boolean_t zfsctl_is_node(vnode_t *);
 
 int zfsctl_rename_snapshot(const char *from, const char *to);
 int zfsctl_destroy_snapshot(const char *snapname, int force);
index 450ac1c..50ecf9b 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_ZFS_DEBUG_H
 #define        _SYS_ZFS_DEBUG_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -68,6 +65,16 @@ extern void __dprintf(const char *file, const char *func,
 
 extern void zfs_panic_recover(const char *fmt, ...);
 
+typedef struct zfs_dbgmsg {
+       list_node_t zdm_node;
+       time_t zdm_timestamp;
+       char zdm_msg[1]; /* variable length allocation */
+} zfs_dbgmsg_t;
+
+extern void zfs_dbgmsg_init(void);
+extern void zfs_dbgmsg_fini(void);
+extern void zfs_dbgmsg(const char *fmt, ...);
+
 #ifdef __cplusplus
 }
 #endif
index 650315b..349f8ef 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -42,11 +42,11 @@ extern "C" {
 #define        ZRENAMING       0x0010          /* znode is being renamed */
 #define        ZCILOOK         0x0020          /* case-insensitive lookup requested */
 #define        ZCIEXACT        0x0040          /* c-i requires c-s match (rename) */
+#define        ZHAVELOCK       0x0080          /* z_name_lock is already held */
 
 /* mknode flags */
 #define        IS_ROOT_NODE    0x01            /* create a root node */
 #define        IS_XATTR        0x02            /* create an extended attribute node */
-#define        IS_REPLAY       0x04            /* we are replaying intent log */
 
 extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **,
     int, int *, pathname_t *);
@@ -57,7 +57,7 @@ extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int,
 extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *,
     pathname_t *);
 extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *,
-    uint_t, znode_t **, int, zfs_acl_ids_t *);
+    uint_t, znode_t **, zfs_acl_ids_t *);
 extern void zfs_rmnode(znode_t *);
 extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old);
 extern boolean_t zfs_dirempty(znode_t *);
index f81ddf4..0feb3ce 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -100,6 +100,8 @@ typedef struct zfs_fuid_info {
 #ifdef _KERNEL
 struct znode;
 extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t);
+extern void zfs_fuid_node_add(zfs_fuid_info_t **, const char *, uint32_t,
+    uint64_t, uint64_t, zfs_fuid_type_t);
 extern void zfs_fuid_destroy(zfsvfs_t *);
 extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t,
     cred_t *, zfs_fuid_info_t **);
index 3a3e6e7..b0cb495 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -30,6 +30,7 @@
 #include <sys/dmu.h>
 #include <sys/zio.h>
 #include <sys/dsl_deleg.h>
+#include <sys/spa.h>
 
 #ifdef _KERNEL
 #include <sys/nvpair.h>
@@ -45,26 +46,86 @@ extern "C" {
 #define        ZFS_SNAPDIR_HIDDEN              0
 #define        ZFS_SNAPDIR_VISIBLE             1
 
-#define        DMU_BACKUP_STREAM_VERSION (1ULL)
-#define        DMU_BACKUP_HEADER_VERSION (2ULL)
+/*
+ * Field manipulation macros for the drr_versioninfo field of the
+ * send stream header.
+ */
+
+/*
+ * Header types for zfs send streams.
+ */
+typedef enum drr_headertype {
+       DMU_SUBSTREAM = 0x1,
+       DMU_COMPOUNDSTREAM = 0x2
+} drr_headertype_t;
+
+#define        DMU_GET_STREAM_HDRTYPE(vi)      BF64_GET((vi), 0, 2)
+#define        DMU_SET_STREAM_HDRTYPE(vi, x)   BF64_SET((vi), 0, 2, x)
+
+#define        DMU_GET_FEATUREFLAGS(vi)        BF64_GET((vi), 2, 30)
+#define        DMU_SET_FEATUREFLAGS(vi, x)     BF64_SET((vi), 2, 30, x)
+
+/*
+ * Feature flags for zfs send streams (flags in drr_versioninfo)
+ */
+
+#define        DMU_BACKUP_FEATURE_DEDUP        (0x1)
+#define        DMU_BACKUP_FEATURE_DEDUPPROPS   (0x2)
+#define        DMU_BACKUP_FEATURE_SA_SPILL     (0x4)
+
+/*
+ * Mask of all supported backup features
+ */
+#define        DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
+               DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL)
+
+/* Are all features in the given flag word currently supported? */
+#define        DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
+
+/*
+ * The drr_versioninfo field of the dmu_replay_record has the
+ * following layout:
+ *
+ *     64      56      48      40      32      24      16      8       0
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ *     |               reserved        |        feature-flags      |C|S|
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * The low order two bits indicate the header type: SUBSTREAM (0x1)
+ * or COMPOUNDSTREAM (0x2).  Using two bits for this is historical:
+ * this field used to be a version number, where the two version types
+ * were 1 and 2.  Using two bits for this allows earlier versions of
+ * the code to be able to recognize send streams that don't use any
+ * of the features indicated by feature flags.
+ */
+
 #define        DMU_BACKUP_MAGIC 0x2F5bacbacULL
 
 #define        DRR_FLAG_CLONE          (1<<0)
 #define        DRR_FLAG_CI_DATA        (1<<1)
 
 /*
+ * flags in the drr_checksumflags field in the DRR_WRITE and
+ * DRR_WRITE_BYREF blocks
+ */
+#define        DRR_CHECKSUM_DEDUP      (1<<0)
+
+#define        DRR_IS_DEDUP_CAPABLE(flags)     ((flags) & DRR_CHECKSUM_DEDUP)
+
+/*
  * zfs ioctl command structure
  */
 typedef struct dmu_replay_record {
        enum {
                DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
-               DRR_WRITE, DRR_FREE, DRR_END,
+               DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF,
+               DRR_SPILL, DRR_NUMTYPES
        } drr_type;
        uint32_t drr_payloadlen;
        union {
                struct drr_begin {
                        uint64_t drr_magic;
-                       uint64_t drr_version;
+                       uint64_t drr_versioninfo; /* was drr_version */
                        uint64_t drr_creation_time;
                        dmu_objset_type_t drr_type;
                        uint32_t drr_flags;
@@ -74,6 +135,7 @@ typedef struct dmu_replay_record {
                } drr_begin;
                struct drr_end {
                        zio_cksum_t drr_checksum;
+                       uint64_t drr_toguid;
                } drr_end;
                struct drr_object {
                        uint64_t drr_object;
@@ -81,14 +143,16 @@ typedef struct dmu_replay_record {
                        dmu_object_type_t drr_bonustype;
                        uint32_t drr_blksz;
                        uint32_t drr_bonuslen;
-                       uint8_t drr_checksum;
+                       uint8_t drr_checksumtype;
                        uint8_t drr_compress;
                        uint8_t drr_pad[6];
+                       uint64_t drr_toguid;
                        /* bonus content follows */
                } drr_object;
                struct drr_freeobjects {
                        uint64_t drr_firstobj;
                        uint64_t drr_numobjs;
+                       uint64_t drr_toguid;
                } drr_freeobjects;
                struct drr_write {
                        uint64_t drr_object;
@@ -96,13 +160,42 @@ typedef struct dmu_replay_record {
                        uint32_t drr_pad;
                        uint64_t drr_offset;
                        uint64_t drr_length;
+                       uint64_t drr_toguid;
+                       uint8_t drr_checksumtype;
+                       uint8_t drr_checksumflags;
+                       uint8_t drr_pad2[6];
+                       ddt_key_t drr_key; /* deduplication key */
                        /* content follows */
                } drr_write;
                struct drr_free {
                        uint64_t drr_object;
                        uint64_t drr_offset;
                        uint64_t drr_length;
+                       uint64_t drr_toguid;
                } drr_free;
+               struct drr_write_byref {
+                       /* where to put the data */
+                       uint64_t drr_object;
+                       uint64_t drr_offset;
+                       uint64_t drr_length;
+                       uint64_t drr_toguid;
+                       /* where to find the prior copy of the data */
+                       uint64_t drr_refguid;
+                       uint64_t drr_refobject;
+                       uint64_t drr_refoffset;
+                       /* properties of the data */
+                       uint8_t drr_checksumtype;
+                       uint8_t drr_checksumflags;
+                       uint8_t drr_pad2[6];
+                       ddt_key_t drr_key; /* deduplication key */
+               } drr_write_byref;
+               struct drr_spill {
+                       uint64_t drr_object;
+                       uint64_t drr_length;
+                       uint64_t drr_toguid;
+                       uint64_t drr_pad[4]; /* needed for crypto */
+                       /* spill data follows */
+               } drr_spill;
        } drr_u;
 } dmu_replay_record_t;
 
@@ -117,6 +210,10 @@ typedef struct zinject_record {
        uint64_t        zi_type;
        uint32_t        zi_freq;
        uint32_t        zi_failfast;
+       char            zi_func[MAXNAMELEN];
+       uint32_t        zi_iotype;
+       int32_t         zi_duration;
+       uint64_t        zi_timer;
 } zinject_record_t;
 
 #define        ZINJECT_NULL            0x1
@@ -146,6 +243,7 @@ typedef struct zfs_cmd {
        char            zc_name[MAXPATHLEN];
        char            zc_value[MAXPATHLEN * 2];
        char            zc_string[MAXNAMELEN];
+       char            zc_top_ds[MAXPATHLEN];
        uint64_t        zc_guid;
        uint64_t        zc_nvlist_conf;         /* really (char *) */
        uint64_t        zc_nvlist_conf_size;
@@ -166,6 +264,7 @@ typedef struct zfs_cmd {
        struct drr_begin zc_begin_record;
        zinject_record_t zc_inject_record;
        boolean_t       zc_defer_destroy;
+       boolean_t       zc_temphold;
 } zfs_cmd_t;
 
 typedef struct zfs_useracct {
@@ -178,6 +277,8 @@ typedef struct zfs_useracct {
 #define        ZVOL_MAX_MINOR  (1 << 16)
 #define        ZFS_MIN_MINOR   (ZVOL_MAX_MINOR + 1)
 
+#define        ZPOOL_EXPORT_AFTER_SPLIT 0x1
+
 #ifdef _KERNEL
 
 typedef struct zfs_creat {
@@ -192,7 +293,7 @@ extern int zfs_secpolicy_rename_perms(const char *from,
     const char *to, cred_t *cr);
 extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr);
 extern int zfs_busy(void);
-extern int zfs_unmount_snap(char *, void *);
+extern int zfs_unmount_snap(const char *, void *);
 
 #endif /* _KERNEL */
 
diff --git a/module/zfs/include/sys/zfs_sa.h b/module/zfs/include/sys/zfs_sa.h
new file mode 100644 (file)
index 0000000..cd312b2
--- /dev/null
@@ -0,0 +1,143 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef        _SYS_ZFS_SA_H
+#define        _SYS_ZFS_SA_H
+
+#ifdef _KERNEL
+#include <sys/types32.h>
+#include <sys/list.h>
+#include <sys/dmu.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_znode.h>
+#include <sys/sa.h>
+#include <sys/zil.h>
+
+
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This is the list of known attributes
+ * to the ZPL.  The values of the actual
+ * attributes are not defined by the order
+ * the enums.  It is controlled by the attribute
+ * registration mechanism.  Two different file systems
+ * could have different numeric values for the same
+ * attributes.  This list is only used for dereferencing
+ * into the table that will hold the actual numeric value.
+ */
+typedef enum zpl_attr {
+       ZPL_ATIME,
+       ZPL_MTIME,
+       ZPL_CTIME,
+       ZPL_CRTIME,
+       ZPL_GEN,
+       ZPL_MODE,
+       ZPL_SIZE,
+       ZPL_PARENT,
+       ZPL_LINKS,
+       ZPL_XATTR,
+       ZPL_RDEV,
+       ZPL_FLAGS,
+       ZPL_UID,
+       ZPL_GID,
+       ZPL_PAD,
+       ZPL_ZNODE_ACL,
+       ZPL_DACL_COUNT,
+       ZPL_SYMLINK,
+       ZPL_SCANSTAMP,
+       ZPL_DACL_ACES,
+       ZPL_END
+} zpl_attr_t;
+
+#define        ZFS_OLD_ZNODE_PHYS_SIZE 0x108
+#define        ZFS_SA_BASE_ATTR_SIZE   (ZFS_OLD_ZNODE_PHYS_SIZE - \
+    sizeof (zfs_acl_phys_t))
+
+#define        SA_MODE_OFFSET          0
+#define        SA_SIZE_OFFSET          8
+#define        SA_GEN_OFFSET           16
+#define        SA_UID_OFFSET           24
+#define        SA_GID_OFFSET           32
+#define        SA_PARENT_OFFSET        40
+
+extern sa_attr_reg_t zfs_attr_table[ZPL_END + 1];
+extern sa_attr_reg_t zfs_legacy_attr_table[ZPL_END + 1];
+
+/*
+ * This is a deprecated data structure that only exists for
+ * dealing with file systems created prior to ZPL version 5.
+ */
+typedef struct znode_phys {
+       uint64_t zp_atime[2];           /*  0 - last file access time */
+       uint64_t zp_mtime[2];           /* 16 - last file modification time */
+       uint64_t zp_ctime[2];           /* 32 - last file change time */
+       uint64_t zp_crtime[2];          /* 48 - creation time */
+       uint64_t zp_gen;                /* 64 - generation (txg of creation) */
+       uint64_t zp_mode;               /* 72 - file mode bits */
+       uint64_t zp_size;               /* 80 - size of file */
+       uint64_t zp_parent;             /* 88 - directory parent (`..') */
+       uint64_t zp_links;              /* 96 - number of links to file */
+       uint64_t zp_xattr;              /* 104 - DMU object for xattrs */
+       uint64_t zp_rdev;               /* 112 - dev_t for VBLK & VCHR files */
+       uint64_t zp_flags;              /* 120 - persistent flags */
+       uint64_t zp_uid;                /* 128 - file owner */
+       uint64_t zp_gid;                /* 136 - owning group */
+       uint64_t zp_zap;                /* 144 - extra attributes */
+       uint64_t zp_pad[3];             /* 152 - future */
+       zfs_acl_phys_t zp_acl;          /* 176 - 263 ACL */
+       /*
+        * Data may pad out any remaining bytes in the znode buffer, eg:
+        *
+        * |<---------------------- dnode_phys (512) ------------------------>|
+        * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
+        *                      |<---- znode (264) ---->|<---- data (56) ---->|
+        *
+        * At present, we use this space for the following:
+        *  - symbolic links
+        *  - 32-byte anti-virus scanstamp (regular files only)
+        */
+} znode_phys_t;
+
+#ifdef _KERNEL
+int zfs_sa_readlink(struct znode *, uio_t *);
+void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *);
+void zfs_sa_upgrade(struct sa_handle  *, dmu_tx_t *);
+void zfs_sa_get_scanstamp(struct znode *, xvattr_t *);
+void zfs_sa_set_scanstamp(struct znode *, xvattr_t *, dmu_tx_t *);
+void zfs_sa_uprade_pre(struct sa_handle *, void *, dmu_tx_t *);
+void zfs_sa_upgrade_post(struct sa_handle *, void *, dmu_tx_t *);
+void zfs_sa_upgrade_txholds(dmu_tx_t *, struct znode *);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_SA_H */
index 2855523..86dcdac 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_FS_ZFS_VFSOPS_H
@@ -31,6 +30,7 @@
 #include <sys/list.h>
 #include <sys/vfs.h>
 #include <sys/zil.h>
+#include <sys/sa.h>
 #include <sys/rrwlock.h>
 #include <sys/zfs_ioctl.h>
 
@@ -39,6 +39,7 @@ extern "C" {
 #endif
 
 typedef struct zfsvfs zfsvfs_t;
+struct znode;
 
 struct zfsvfs {
        vfs_t           *z_vfs;         /* generic fs struct */
@@ -56,7 +57,6 @@ struct zfsvfs {
        boolean_t       z_fuid_dirty;   /* need to sync fuid table ? */
        struct zfs_fuid_info    *z_fuid_replay; /* fuid info for replay */
        zilog_t         *z_log;         /* intent log pointer */
-       uint_t          z_acl_mode;     /* acl chmod/mode behavior */
        uint_t          z_acl_inherit;  /* acl inheritance behavior */
        zfs_case_t      z_case;         /* case-sense */
        boolean_t       z_utf8;         /* utf8-only */
@@ -73,11 +73,13 @@ struct zfsvfs {
        boolean_t       z_vscan;        /* virus scan on/off */
        boolean_t       z_use_fuids;    /* version allows fuids */
        boolean_t       z_replay;       /* set during ZIL replay */
+       boolean_t       z_use_sa;       /* version allow system attributes */
        uint64_t        z_version;      /* ZPL version */
        uint64_t        z_shares_dir;   /* hidden shares dir */
        kmutex_t        z_lock;
        uint64_t        z_userquota_obj;
        uint64_t        z_groupquota_obj;
+       sa_attr_type_t  *z_attr_table;  /* SA attr mapping->id */
 #define        ZFS_OBJ_MTX_SZ  64
        kmutex_t        z_hold_mtx[ZFS_OBJ_MTX_SZ];     /* znode hold locks */
 };
@@ -132,19 +134,22 @@ typedef struct zfid_long {
 
 extern uint_t zfs_fsyncer_key;
 
-extern int zfs_suspend_fs(zfsvfs_t *zfsvfs, char *osname, int *mode);
-extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode);
+extern int zfs_suspend_fs(zfsvfs_t *zfsvfs);
+extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname);
 extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
     const char *domain, uint64_t rid, uint64_t *valuep);
 extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
     uint64_t *cookiep, void *vbuf, uint64_t *bufsizep);
 extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
     const char *domain, uint64_t rid, uint64_t quota);
-extern boolean_t zfs_usergroup_overquota(zfsvfs_t *zfsvfs,
-    boolean_t isgroup, uint64_t fuid);
+extern boolean_t zfs_owner_overquota(zfsvfs_t *zfsvfs, struct znode *,
+    boolean_t isgroup);
+extern boolean_t zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup,
+    uint64_t fuid);
 extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers);
-extern int zfsvfs_create(const char *name, int mode, zfsvfs_t **zvp);
+extern int zfsvfs_create(const char *name, zfsvfs_t **zfvp);
 extern void zfsvfs_free(zfsvfs_t *zfsvfs);
+extern int zfs_check_global_label(const char *dsname, const char *hexsl);
 
 #ifdef __cplusplus
 }
index 5db5b8d..4781ee6 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #include <sys/attr.h>
 #include <sys/list.h>
 #include <sys/dmu.h>
+#include <sys/sa.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/rrwlock.h>
+#include <sys/zfs_sa.h>
 #endif
 #include <sys/zfs_acl.h>
 #include <sys/zil.h>
@@ -57,13 +59,16 @@ extern "C" {
 #define        ZFS_OPAQUE              0x0000010000000000
 #define        ZFS_AV_QUARANTINED      0x0000020000000000
 #define        ZFS_AV_MODIFIED         0x0000040000000000
+#define        ZFS_REPARSE             0x0000080000000000
 
-#define        ZFS_ATTR_SET(zp, attr, value)   \
+#define        ZFS_ATTR_SET(zp, attr, value, pflags, tx) \
 { \
        if (value) \
-               zp->z_phys->zp_flags |= attr; \
+               pflags |= attr; \
        else \
-               zp->z_phys->zp_flags &= ~attr; \
+               pflags &= ~attr; \
+       VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zp->z_zfsvfs), \
+           &pflags, sizeof (pflags), tx)); \
 }
 
 /*
@@ -79,6 +84,27 @@ extern "C" {
 #define        ZFS_BONUS_SCANSTAMP     0x80            /* Scanstamp in bonus area */
 #define        ZFS_NO_EXECS_DENIED     0x100           /* exec was given to everyone */
 
+#define        SA_ZPL_ATIME(z)         z->z_attr_table[ZPL_ATIME]
+#define        SA_ZPL_MTIME(z)         z->z_attr_table[ZPL_MTIME]
+#define        SA_ZPL_CTIME(z)         z->z_attr_table[ZPL_CTIME]
+#define        SA_ZPL_CRTIME(z)        z->z_attr_table[ZPL_CRTIME]
+#define        SA_ZPL_GEN(z)           z->z_attr_table[ZPL_GEN]
+#define        SA_ZPL_DACL_ACES(z)     z->z_attr_table[ZPL_DACL_ACES]
+#define        SA_ZPL_XATTR(z)         z->z_attr_table[ZPL_XATTR]
+#define        SA_ZPL_SYMLINK(z)       z->z_attr_table[ZPL_SYMLINK]
+#define        SA_ZPL_RDEV(z)          z->z_attr_table[ZPL_RDEV]
+#define        SA_ZPL_SCANSTAMP(z)     z->z_attr_table[ZPL_SCANSTAMP]
+#define        SA_ZPL_UID(z)           z->z_attr_table[ZPL_UID]
+#define        SA_ZPL_GID(z)           z->z_attr_table[ZPL_GID]
+#define        SA_ZPL_PARENT(z)        z->z_attr_table[ZPL_PARENT]
+#define        SA_ZPL_LINKS(z)         z->z_attr_table[ZPL_LINKS]
+#define        SA_ZPL_MODE(z)          z->z_attr_table[ZPL_MODE]
+#define        SA_ZPL_DACL_COUNT(z)    z->z_attr_table[ZPL_DACL_COUNT]
+#define        SA_ZPL_FLAGS(z)         z->z_attr_table[ZPL_FLAGS]
+#define        SA_ZPL_SIZE(z)          z->z_attr_table[ZPL_SIZE]
+#define        SA_ZPL_ZNODE_ACL(z)     z->z_attr_table[ZPL_ZNODE_ACL]
+#define        SA_ZPL_PAD(z)           z->z_attr_table[ZPL_PAD]
+
 /*
  * Is ID ephemeral?
  */
@@ -87,8 +113,10 @@ extern "C" {
 /*
  * Should we use FUIDs?
  */
-#define        USE_FUIDS(version, os)  (version >= ZPL_VERSION_FUID &&\
+#define        USE_FUIDS(version, os)  (version >= ZPL_VERSION_FUID && \
     spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
+#define        USE_SA(version, os) (version >= ZPL_VERSION_SA && \
+    spa_version(dmu_objset_spa(os)) >= SPA_VERSION_SA)
 
 #define        MASTER_NODE_OBJ 1
 
@@ -103,6 +131,7 @@ extern "C" {
 #define        ZPL_VERSION_STR         "VERSION"
 #define        ZFS_FUID_TABLES         "FUID"
 #define        ZFS_SHARES_DIR          "SHARES"
+#define        ZFS_SA_ATTRS            "SA_ATTRS"
 
 #define        ZFS_MAX_BLOCKSIZE       (SPA_MAXBLOCKSIZE)
 
@@ -131,42 +160,6 @@ extern "C" {
 #define        ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48)
 
 /*
- * This is the persistent portion of the znode.  It is stored
- * in the "bonus buffer" of the file.  Short symbolic links
- * are also stored in the bonus buffer.
- */
-typedef struct znode_phys {
-       uint64_t zp_atime[2];           /*  0 - last file access time */
-       uint64_t zp_mtime[2];           /* 16 - last file modification time */
-       uint64_t zp_ctime[2];           /* 32 - last file change time */
-       uint64_t zp_crtime[2];          /* 48 - creation time */
-       uint64_t zp_gen;                /* 64 - generation (txg of creation) */
-       uint64_t zp_mode;               /* 72 - file mode bits */
-       uint64_t zp_size;               /* 80 - size of file */
-       uint64_t zp_parent;             /* 88 - directory parent (`..') */
-       uint64_t zp_links;              /* 96 - number of links to file */
-       uint64_t zp_xattr;              /* 104 - DMU object for xattrs */
-       uint64_t zp_rdev;               /* 112 - dev_t for VBLK & VCHR files */
-       uint64_t zp_flags;              /* 120 - persistent flags */
-       uint64_t zp_uid;                /* 128 - file owner */
-       uint64_t zp_gid;                /* 136 - owning group */
-       uint64_t zp_zap;                /* 144 - extra attributes */
-       uint64_t zp_pad[3];             /* 152 - future */
-       zfs_acl_phys_t zp_acl;          /* 176 - 263 ACL */
-       /*
-        * Data may pad out any remaining bytes in the znode buffer, eg:
-        *
-        * |<---------------------- dnode_phys (512) ------------------------>|
-        * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
-        *                      |<---- znode (264) ---->|<---- data (56) ---->|
-        *
-        * At present, we use this space for the following:
-        *  - symbolic links
-        *  - 32-byte anti-virus scanstamp (regular files only)
-        */
-} znode_phys_t;
-
-/*
  * Directory entry locks control access to directory entries.
  * They are used to protect creates, deletes, and renames.
  * Each directory znode has a mutex and a list of locked names.
@@ -175,6 +168,7 @@ typedef struct znode_phys {
 typedef struct zfs_dirlock {
        char            *dl_name;       /* directory entry being locked */
        uint32_t        dl_sharecnt;    /* 0 if exclusive, > 0 if shared */
+       uint8_t         dl_namelock;    /* 1 if z_name_lock is NOT held */
        uint16_t        dl_namesize;    /* set if dl_name was allocated */
        kcondvar_t      dl_cv;          /* wait for entry to be unlocked */
        struct znode    *dl_dzp;        /* directory znode */
@@ -198,16 +192,20 @@ typedef struct znode {
        uint_t          z_seq;          /* modification sequence number */
        uint64_t        z_mapcnt;       /* number of pages mapped to file */
        uint64_t        z_last_itx;     /* last ZIL itx on this znode */
-       uint64_t        z_gen;          /* generation (same as zp_gen) */
+       uint64_t        z_gen;          /* generation (cached) */
+       uint64_t        z_size;         /* file size (cached) */
+       uint64_t        z_atime[2];     /* atime (cached) */
+       uint64_t        z_links;        /* file links (cached) */
+       uint64_t        z_pflags;       /* pflags (cached) */
+       uid_t           z_uid;          /* uid mapped (cached) */
+       uid_t           z_gid;          /* gid mapped (cached) */
+       mode_t          z_mode;         /* mode (cached) */
        uint32_t        z_sync_cnt;     /* synchronous open count */
        kmutex_t        z_acl_lock;     /* acl data lock */
        zfs_acl_t       *z_acl_cached;  /* cached acl */
        list_node_t     z_link_node;    /* all znodes in fs link */
-       /*
-        * These are dmu managed fields.
-        */
-       znode_phys_t    *z_phys;        /* pointer to persistent znode */
-       dmu_buf_t       *z_dbuf;        /* buffer containing the z_phys */
+       sa_handle_t     *z_sa_hdl;      /* handle to sa data */
+       boolean_t       z_is_sa;        /* are we native sa? */
 } znode_t;
 
 
@@ -250,7 +248,7 @@ typedef struct znode {
 #define        ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG)
 
 #define        ZFS_VERIFY_ZP(zp) \
-       if ((zp)->z_dbuf == NULL) { \
+       if ((zp)->z_sa_hdl == NULL) { \
                ZFS_EXIT((zp)->z_zfsvfs); \
                return (EIO); \
        } \
@@ -292,14 +290,14 @@ typedef struct znode {
 
 #define        ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \
        if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \
-               zfs_time_stamper(zp, ACCESSED, NULL)
+               zfs_tstamp_update_setup(zp, ACCESSED, NULL, NULL, B_FALSE);
 
 extern int     zfs_init_fs(zfsvfs_t *, znode_t **);
 extern void    zfs_set_dataprop(objset_t *);
 extern void    zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *,
     dmu_tx_t *tx);
-extern void    zfs_time_stamper(znode_t *, uint_t, dmu_tx_t *);
-extern void    zfs_time_stamper_locked(znode_t *, uint_t, dmu_tx_t *);
+extern void    zfs_tstamp_update_setup(znode_t *, uint_t, uint64_t [2],
+    uint64_t [2], boolean_t);
 extern void    zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *);
 extern int     zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t);
 extern void    zfs_znode_init(void);
@@ -338,7 +336,7 @@ extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
     znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp);
 extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
     vsecattr_t *vsecp, zfs_fuid_info_t *fuidp);
-extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap);
+extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx);
 extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
 extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
 
index 2aff8cd..2f01cf9 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
+/* Portions Copyright 2010 Robert Milkowski */
+
 #ifndef        _SYS_ZIL_H
 #define        _SYS_ZIL_H
 
@@ -55,34 +56,40 @@ typedef struct zil_header {
        uint64_t zh_claim_txg;  /* txg in which log blocks were claimed */
        uint64_t zh_replay_seq; /* highest replayed sequence number */
        blkptr_t zh_log;        /* log chain */
-       uint64_t zh_claim_seq;  /* highest claimed sequence number */
+       uint64_t zh_claim_blk_seq; /* highest claimed block sequence number */
        uint64_t zh_flags;      /* header flags */
-       uint64_t zh_pad[4];
+       uint64_t zh_claim_lr_seq; /* highest claimed lr sequence number */
+       uint64_t zh_pad[3];
 } zil_header_t;
 
 /*
  * zh_flags bit settings
  */
-#define        ZIL_REPLAY_NEEDED 0x1   /* replay needed - internal only */
+#define        ZIL_REPLAY_NEEDED       0x1     /* replay needed - internal only */
+#define        ZIL_CLAIM_LR_SEQ_VALID  0x2     /* zh_claim_lr_seq field is valid */
 
 /*
- * Log block trailer - structure at the end of the header and each log block
+ * Log block chaining.
+ *
+ * Log blocks are chained together. Originally they were chained at the
+ * end of the block. For performance reasons the chain was moved to the
+ * beginning of the block which allows writes for only the data being used.
+ * The older position is supported for backwards compatability.
  *
- * The zit_bt contains a zbt_cksum which for the intent log is
+ * The zio_eck_t contains a zec_cksum which for the intent log is
  * the sequence number of this log block. A seq of 0 is invalid.
- * The zbt_cksum is checked by the SPA against the sequence
+ * The zec_cksum is checked by the SPA against the sequence
  * number passed in the blk_cksum field of the blkptr_t
  */
-typedef struct zil_trailer {
-       uint64_t zit_pad;
-       blkptr_t zit_next_blk;  /* next block in chain */
-       uint64_t zit_nused;     /* bytes in log block used */
-       zio_block_tail_t zit_bt; /* block trailer */
-} zil_trailer_t;
+typedef struct zil_chain {
+       uint64_t zc_pad;
+       blkptr_t zc_next_blk;   /* next block in chain */
+       uint64_t zc_nused;      /* bytes in log block used */
+       zio_eck_t zc_eck;       /* block trailer */
+} zil_chain_t;
 
 #define        ZIL_MIN_BLKSZ   4096ULL
 #define        ZIL_MAX_BLKSZ   SPA_MAXBLOCKSIZE
-#define        ZIL_BLK_DATA_SZ(lwb)    ((lwb)->lwb_sz - sizeof (zil_trailer_t))
 
 /*
  * The words of a log block checksum.
@@ -139,7 +146,8 @@ typedef enum zil_create {
 #define        TX_MKDIR_ACL            17      /* mkdir with ACL */
 #define        TX_MKDIR_ATTR           18      /* mkdir with attr */
 #define        TX_MKDIR_ACL_ATTR       19      /* mkdir with ACL + attrs */
-#define        TX_MAX_TYPE             20      /* Max transaction type */
+#define        TX_WRITE2               20      /* dmu_sync EALREADY write */
+#define        TX_MAX_TYPE             21      /* Max transaction type */
 
 /*
  * The transactions for mkdir, symlink, remove, rmdir, link, and rename
@@ -149,6 +157,20 @@ typedef enum zil_create {
 #define        TX_CI   ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */
 
 /*
+ * Transactions for write, truncate, setattr, acl_v0, and acl can be logged
+ * out of order.  For convenience in the code, all such records must have
+ * lr_foid at the same offset.
+ */
+#define        TX_OOO(txtype)                  \
+       ((txtype) == TX_WRITE ||        \
+       (txtype) == TX_TRUNCATE ||      \
+       (txtype) == TX_SETATTR ||       \
+       (txtype) == TX_ACL_V0 ||        \
+       (txtype) == TX_ACL ||           \
+       (txtype) == TX_WRITE2)
+
+
+/*
  * Format of log records.
  * The fields are carefully defined to allow them to be aligned
  * and sized the same on sparc & intel architectures.
@@ -168,6 +190,14 @@ typedef struct {                   /* common log record header */
 } lr_t;
 
 /*
+ * Common start of all out-of-order record types (TX_OOO() above).
+ */
+typedef struct {
+       lr_t            lr_common;      /* common portion of log record */
+       uint64_t        lr_foid;        /* object id */
+} lr_ooo_t;
+
+/*
  * Handle option extended vattr attributes.
  *
  * Whenever new attributes are added the version number
@@ -257,7 +287,7 @@ typedef struct {
        uint64_t        lr_foid;        /* file object to write */
        uint64_t        lr_offset;      /* offset to write to */
        uint64_t        lr_length;      /* user data length to write */
-       uint64_t        lr_blkoff;      /* offset represented by lr_blkptr */
+       uint64_t        lr_blkoff;      /* no longer used */
        blkptr_t        lr_blkptr;      /* spa block pointer for replay */
        /* write data will follow for small writes */
 } lr_write_t;
@@ -332,6 +362,7 @@ typedef enum {
                        /* and put blkptr in log, rather than actual data) */
        WR_COPIED,      /* immediate - data is copied into lr_write_t */
        WR_NEED_COPY,   /* immediate - data needs to be copied if pushed */
+       WR_NUM_STATES   /* number of states */
 } itx_wr_state_t;
 
 typedef struct itx {
@@ -344,26 +375,14 @@ typedef struct itx {
        /* followed by type-specific part of lr_xx_t and its immediate data */
 } itx_t;
 
-
-/*
- * zgd_t is passed through dmu_sync() to the callback routine zfs_get_done()
- * to handle the cleanup of the dmu_sync() buffer write
- */
-typedef struct {
-       zilog_t         *zgd_zilog;     /* zilog */
-       blkptr_t        *zgd_bp;        /* block pointer */
-       struct rl       *zgd_rl;        /* range lock */
-} zgd_t;
-
-
-typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
+typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
     uint64_t txg);
-typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
+typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
     uint64_t txg);
 typedef int zil_replay_func_t();
 typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio);
 
-extern uint64_t        zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
+extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
     zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);
 
 extern void    zil_init(void);
@@ -377,27 +396,33 @@ extern void       zil_close(zilog_t *zilog);
 
 extern void    zil_replay(objset_t *os, void *arg,
     zil_replay_func_t *replay_func[TX_MAX_TYPE]);
+extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx);
 extern void    zil_destroy(zilog_t *zilog, boolean_t keep_first);
 extern void    zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);
 
 extern itx_t   *zil_itx_create(uint64_t txtype, size_t lrsize);
+extern void    zil_itx_destroy(itx_t *itx);
 extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
 
 extern void    zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid);
 
-extern int     zil_vdev_offline(char *osname, void *txarg);
-extern int     zil_claim(char *osname, void *txarg);
-extern int     zil_check_log_chain(char *osname, void *txarg);
+extern int     zil_vdev_offline(const char *osname, void *txarg);
+extern int     zil_claim(const char *osname, void *txarg);
+extern int     zil_check_log_chain(const char *osname, void *txarg);
 extern void    zil_sync(zilog_t *zilog, dmu_tx_t *tx);
 extern void    zil_clean(zilog_t *zilog);
-extern int     zil_is_committed(zilog_t *zilog);
 
 extern int     zil_suspend(zilog_t *zilog);
 extern void    zil_resume(zilog_t *zilog);
 
-extern void    zil_add_block(zilog_t *zilog, blkptr_t *bp);
+extern void    zil_add_block(zilog_t *zilog, const blkptr_t *bp);
+extern int     zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp);
+
+extern void    zil_set_sync(zilog_t *zilog, uint64_t syncval);
+
+extern void    zil_set_logbias(zilog_t *zilog, uint64_t slogval);
 
-extern int zil_disable;
+extern int zil_replay_disable;
 
 #ifdef __cplusplus
 }
index 685305f..6560a79 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
+/* Portions Copyright 2010 Robert Milkowski */
+
 #ifndef        _SYS_ZIL_IMPL_H
 #define        _SYS_ZIL_IMPL_H
 
@@ -43,8 +44,8 @@ typedef struct lwb {
        int             lwb_sz;         /* size of block and buffer */
        char            *lwb_buf;       /* log write buffer */
        zio_t           *lwb_zio;       /* zio for this buffer */
+       dmu_tx_t        *lwb_tx;        /* tx for log block allocation */
        uint64_t        lwb_max_txg;    /* highest txg in this lwb */
-       txg_handle_t    lwb_txgh;       /* txg handle for txg_exit() */
        list_node_t     lwb_node;       /* zilog->zl_lwb_list linkage */
 } lwb_t;
 
@@ -57,6 +58,8 @@ typedef struct zil_vdev_node {
        avl_node_t      zv_node;        /* AVL tree linkage */
 } zil_vdev_node_t;
 
+#define        ZIL_PREV_BLKS 16
+
 /*
  * Stable storage intent log management structure.  One per dataset.
  */
@@ -68,9 +71,10 @@ struct zilog {
        objset_t        *zl_os;         /* object set we're logging */
        zil_get_data_t  *zl_get_data;   /* callback to get object content */
        zio_t           *zl_root_zio;   /* log writer root zio */
-       uint64_t        zl_itx_seq;     /* next itx sequence number */
+       uint64_t        zl_itx_seq;     /* next in-core itx sequence number */
+       uint64_t        zl_lr_seq;      /* on-disk log record sequence number */
        uint64_t        zl_commit_seq;  /* committed upto this number */
-       uint64_t        zl_lr_seq;      /* log record sequence number */
+       uint64_t        zl_commit_lr_seq; /* last committed on-disk lr seq */
        uint64_t        zl_destroy_txg; /* txg of last zil_destroy() */
        uint64_t        zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */
        uint64_t        zl_replaying_seq; /* current replay seq number */
@@ -82,7 +86,13 @@ struct zilog {
        uint8_t         zl_replay;      /* replaying records while set */
        uint8_t         zl_stop_sync;   /* for debugging */
        uint8_t         zl_writer;      /* boolean: write setup in progress */
-       uint8_t         zl_log_error;   /* boolean: log write error */
+       uint8_t         zl_logbias;     /* latency or throughput */
+       uint8_t         zl_sync;        /* synchronous or asynchronous */
+       int             zl_parse_error; /* last zil_parse() error */
+       uint64_t        zl_parse_blk_seq; /* highest blk seq on last parse */
+       uint64_t        zl_parse_lr_seq; /* highest lr seq on last parse */
+       uint64_t        zl_parse_blk_count; /* number of blocks parsed */
+       uint64_t        zl_parse_lr_count; /* number of log records parsed */
        list_t          zl_itx_list;    /* in-memory itx list */
        uint64_t        zl_itx_list_sz; /* total size of records on list */
        uint64_t        zl_cur_used;    /* current commit log size used */
@@ -91,17 +101,20 @@ struct zilog {
        kmutex_t        zl_vdev_lock;   /* protects zl_vdev_tree */
        avl_tree_t      zl_vdev_tree;   /* vdevs to flush in zil_commit() */
        taskq_t         *zl_clean_taskq; /* runs lwb and itx clean tasks */
-       avl_tree_t      zl_dva_tree;    /* track DVAs during log parse */
+       avl_tree_t      zl_bp_tree;     /* track bps during log parse */
        clock_t         zl_replay_time; /* lbolt of when replay started */
        uint64_t        zl_replay_blks; /* number of log blocks replayed */
+       zil_header_t    zl_old_header;  /* debugging aid */
+       uint_t          zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
+       uint_t          zl_prev_rotor;  /* rotor for zl_prev[] */
 };
 
-typedef struct zil_dva_node {
+typedef struct zil_bp_node {
        dva_t           zn_dva;
        avl_node_t      zn_node;
-} zil_dva_node_t;
+} zil_bp_node_t;
 
-#define        ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \
+#define        ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
     sizeof (lr_write_t))
 
 #ifdef __cplusplus
index e47d8f4..0400c17 100644 (file)
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _ZIO_H
 extern "C" {
 #endif
 
-#define        ZBT_MAGIC       0x210da7ab10c7a11ULL    /* zio data bloc tail */
+/*
+ * Embedded checksum
+ */
+#define        ZEC_MAGIC       0x210da7ab10c7a11ULL
 
-typedef struct zio_block_tail {
-       uint64_t        zbt_magic;      /* for validation, endianness   */
-       zio_cksum_t     zbt_cksum;      /* 256-bit checksum             */
-} zio_block_tail_t;
+typedef struct zio_eck {
+       uint64_t        zec_magic;      /* for validation, endianness   */
+       zio_cksum_t     zec_cksum;      /* 256-bit checksum             */
+} zio_eck_t;
 
 /*
  * Gang block headers are self-checksumming and contain an array
@@ -51,16 +53,16 @@ typedef struct zio_block_tail {
  */
 #define        SPA_GANGBLOCKSIZE       SPA_MINBLOCKSIZE
 #define        SPA_GBH_NBLKPTRS        ((SPA_GANGBLOCKSIZE - \
-       sizeof (zio_block_tail_t)) / sizeof (blkptr_t))
+       sizeof (zio_eck_t)) / sizeof (blkptr_t))
 #define        SPA_GBH_FILLER          ((SPA_GANGBLOCKSIZE - \
-       sizeof (zio_block_tail_t) - \
+       sizeof (zio_eck_t) - \
        (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
        sizeof (uint64_t))
 
 typedef struct zio_gbh {
        blkptr_t                zg_blkptr[SPA_GBH_NBLKPTRS];
        uint64_t                zg_filler[SPA_GBH_FILLER];
-       zio_block_tail_t        zg_tail;
+       zio_eck_t               zg_tail;
 } zio_gbh_phys_t;
 
 enum zio_checksum {
@@ -73,12 +75,19 @@ enum zio_checksum {
        ZIO_CHECKSUM_FLETCHER_2,
        ZIO_CHECKSUM_FLETCHER_4,
        ZIO_CHECKSUM_SHA256,
+       ZIO_CHECKSUM_ZILOG2,
        ZIO_CHECKSUM_FUNCTIONS
 };
 
 #define        ZIO_CHECKSUM_ON_VALUE   ZIO_CHECKSUM_FLETCHER_4
 #define        ZIO_CHECKSUM_DEFAULT    ZIO_CHECKSUM_ON
 
+#define        ZIO_CHECKSUM_MASK       0xffULL
+#define        ZIO_CHECKSUM_VERIFY     (1 << 8)
+
+#define        ZIO_DEDUPCHECKSUM       ZIO_CHECKSUM_SHA256
+#define        ZIO_DEDUPDITTO_MIN      100
+
 enum zio_compress {
        ZIO_COMPRESS_INHERIT = 0,
        ZIO_COMPRESS_ON,
@@ -94,12 +103,19 @@ enum zio_compress {
        ZIO_COMPRESS_GZIP_7,
        ZIO_COMPRESS_GZIP_8,
        ZIO_COMPRESS_GZIP_9,
+       ZIO_COMPRESS_ZLE,
        ZIO_COMPRESS_FUNCTIONS
 };
 
 #define        ZIO_COMPRESS_ON_VALUE   ZIO_COMPRESS_LZJB
 #define        ZIO_COMPRESS_DEFAULT    ZIO_COMPRESS_OFF
 
+#define        BOOTFS_COMPRESS_VALID(compress)                 \
+       ((compress) == ZIO_COMPRESS_LZJB ||             \
+       ((compress) == ZIO_COMPRESS_ON &&               \
+       ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) ||  \
+       (compress) == ZIO_COMPRESS_OFF)
+
 #define        ZIO_FAILURE_MODE_WAIT           0
 #define        ZIO_FAILURE_MODE_CONTINUE       1
 #define        ZIO_FAILURE_MODE_PANIC          2
@@ -107,84 +123,89 @@ enum zio_compress {
 #define        ZIO_PRIORITY_NOW                (zio_priority_table[0])
 #define        ZIO_PRIORITY_SYNC_READ          (zio_priority_table[1])
 #define        ZIO_PRIORITY_SYNC_WRITE         (zio_priority_table[2])
-#define        ZIO_PRIORITY_ASYNC_READ         (zio_priority_table[3])
-#define        ZIO_PRIORITY_ASYNC_WRITE        (zio_priority_table[4])
-#define        ZIO_PRIORITY_FREE               (zio_priority_table[5])
-#define        ZIO_PRIORITY_CACHE_FILL         (zio_priority_table[6])
-#define        ZIO_PRIORITY_LOG_WRITE          (zio_priority_table[7])
-#define        ZIO_PRIORITY_RESILVER           (zio_priority_table[8])
-#define        ZIO_PRIORITY_SCRUB              (zio_priority_table[9])
-#define        ZIO_PRIORITY_TABLE_SIZE         10
-
-#define        ZIO_FLAG_MUSTSUCCEED            0x000000
-#define        ZIO_FLAG_CANFAIL                0x000001
-#define        ZIO_FLAG_SPECULATIVE            0x000002
-#define        ZIO_FLAG_CONFIG_WRITER          0x000004
-#define        ZIO_FLAG_DONT_RETRY             0x000008
-
-#define        ZIO_FLAG_DONT_CACHE             0x000010
-#define        ZIO_FLAG_DONT_QUEUE             0x000020
-#define        ZIO_FLAG_DONT_AGGREGATE         0x000040
-#define        ZIO_FLAG_DONT_PROPAGATE         0x000080
-
-#define        ZIO_FLAG_IO_BYPASS              0x000100
-#define        ZIO_FLAG_IO_REPAIR              0x000200
-#define        ZIO_FLAG_IO_RETRY               0x000400
-#define        ZIO_FLAG_IO_REWRITE             0x000800
-
-#define        ZIO_FLAG_SELF_HEAL              0x001000
-#define        ZIO_FLAG_RESILVER               0x002000
-#define        ZIO_FLAG_SCRUB                  0x004000
-#define        ZIO_FLAG_SCRUB_THREAD           0x008000
-
-#define        ZIO_FLAG_PROBE                  0x010000
-#define        ZIO_FLAG_GANG_CHILD             0x020000
-#define        ZIO_FLAG_RAW                    0x040000
-#define        ZIO_FLAG_GODFATHER              0x080000
-
-#define        ZIO_FLAG_TRYHARD                0x100000
-#define        ZIO_FLAG_NODATA                 0x200000
-#define        ZIO_FLAG_OPTIONAL               0x400000
-
-#define        ZIO_FLAG_GANG_INHERIT           \
-       (ZIO_FLAG_CANFAIL |             \
-       ZIO_FLAG_SPECULATIVE |          \
-       ZIO_FLAG_CONFIG_WRITER |        \
-       ZIO_FLAG_DONT_RETRY |           \
-       ZIO_FLAG_DONT_CACHE |           \
-       ZIO_FLAG_DONT_AGGREGATE |       \
-       ZIO_FLAG_SELF_HEAL |            \
-       ZIO_FLAG_RESILVER |             \
-       ZIO_FLAG_SCRUB |                \
-       ZIO_FLAG_SCRUB_THREAD)
-
-#define        ZIO_FLAG_VDEV_INHERIT           \
-       (ZIO_FLAG_GANG_INHERIT |        \
-       ZIO_FLAG_IO_REPAIR |            \
-       ZIO_FLAG_IO_RETRY |             \
-       ZIO_FLAG_PROBE |                \
-       ZIO_FLAG_TRYHARD |              \
-       ZIO_FLAG_NODATA |               \
-       ZIO_FLAG_OPTIONAL)
-
-#define        ZIO_FLAG_AGG_INHERIT            \
-       (ZIO_FLAG_DONT_AGGREGATE |      \
-       ZIO_FLAG_IO_REPAIR |            \
-       ZIO_FLAG_SELF_HEAL |            \
-       ZIO_FLAG_RESILVER |             \
-       ZIO_FLAG_SCRUB |                \
-       ZIO_FLAG_SCRUB_THREAD)
+#define        ZIO_PRIORITY_LOG_WRITE          (zio_priority_table[3])
+#define        ZIO_PRIORITY_CACHE_FILL         (zio_priority_table[4])
+#define        ZIO_PRIORITY_AGG                (zio_priority_table[5])
+#define        ZIO_PRIORITY_FREE               (zio_priority_table[6])
+#define        ZIO_PRIORITY_ASYNC_WRITE        (zio_priority_table[7])
+#define        ZIO_PRIORITY_ASYNC_READ         (zio_priority_table[8])
+#define        ZIO_PRIORITY_RESILVER           (zio_priority_table[9])
+#define        ZIO_PRIORITY_SCRUB              (zio_priority_table[10])
+#define        ZIO_PRIORITY_DDT_PREFETCH       (zio_priority_table[11])
+#define        ZIO_PRIORITY_TABLE_SIZE         12
 
 #define        ZIO_PIPELINE_CONTINUE           0x100
 #define        ZIO_PIPELINE_STOP               0x101
 
+enum zio_flag {
+       /*
+        * Flags inherited by gang, ddt, and vdev children,
+        * and that must be equal for two zios to aggregate
+        */
+       ZIO_FLAG_DONT_AGGREGATE = 1 << 0,
+       ZIO_FLAG_IO_REPAIR      = 1 << 1,
+       ZIO_FLAG_SELF_HEAL      = 1 << 2,
+       ZIO_FLAG_RESILVER       = 1 << 3,
+       ZIO_FLAG_SCRUB          = 1 << 4,
+       ZIO_FLAG_SCRUB_THREAD   = 1 << 5,
+
+#define        ZIO_FLAG_AGG_INHERIT    (ZIO_FLAG_CANFAIL - 1)
+
+       /*
+        * Flags inherited by ddt, gang, and vdev children.
+        */
+       ZIO_FLAG_CANFAIL        = 1 << 6,       /* must be first for INHERIT */
+       ZIO_FLAG_SPECULATIVE    = 1 << 7,
+       ZIO_FLAG_CONFIG_WRITER  = 1 << 8,
+       ZIO_FLAG_DONT_RETRY     = 1 << 9,
+       ZIO_FLAG_DONT_CACHE     = 1 << 10,
+       ZIO_FLAG_NODATA         = 1 << 11,
+       ZIO_FLAG_INDUCE_DAMAGE  = 1 << 12,
+
+#define        ZIO_FLAG_DDT_INHERIT    (ZIO_FLAG_IO_RETRY - 1)
+#define        ZIO_FLAG_GANG_INHERIT   (ZIO_FLAG_IO_RETRY - 1)
+
+       /*
+        * Flags inherited by vdev children.
+        */
+       ZIO_FLAG_IO_RETRY       = 1 << 13,      /* must be first for INHERIT */
+       ZIO_FLAG_PROBE          = 1 << 14,
+       ZIO_FLAG_TRYHARD        = 1 << 15,
+       ZIO_FLAG_OPTIONAL       = 1 << 16,
+
+#define        ZIO_FLAG_VDEV_INHERIT   (ZIO_FLAG_DONT_QUEUE - 1)
+
+       /*
+        * Flags not inherited by any children.
+        */
+       ZIO_FLAG_DONT_QUEUE     = 1 << 17,      /* must be first for INHERIT */
+       ZIO_FLAG_DONT_PROPAGATE = 1 << 18,
+       ZIO_FLAG_IO_BYPASS      = 1 << 19,
+       ZIO_FLAG_IO_REWRITE     = 1 << 20,
+       ZIO_FLAG_RAW            = 1 << 21,
+       ZIO_FLAG_GANG_CHILD     = 1 << 22,
+       ZIO_FLAG_DDT_CHILD      = 1 << 23,
+       ZIO_FLAG_GODFATHER      = 1 << 24
+};
+
+#define        ZIO_FLAG_MUSTSUCCEED            0
+
+#define        ZIO_DDT_CHILD_FLAGS(zio)                                \
+       (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) |             \
+       ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL)
+
 #define        ZIO_GANG_CHILD_FLAGS(zio)                               \
        (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) |            \
        ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL)
 
+#define        ZIO_VDEV_CHILD_FLAGS(zio)                               \
+       (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) |            \
+       ZIO_FLAG_CANFAIL)
+
 enum zio_child {
        ZIO_CHILD_VDEV = 0,
        ZIO_CHILD_GANG,
+       ZIO_CHILD_DDT,
        ZIO_CHILD_LOGICAL,
        ZIO_CHILD_TYPES
 };
@@ -202,7 +223,6 @@ enum zio_wait_type {
 #define        ECKSUM  EBADE
 #define        EFRAGS  EBADR
 
-typedef struct zio zio_t;
 typedef void zio_done_func_t(zio_t *zio);
 
 extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE];
@@ -211,18 +231,15 @@ extern char *zio_type_name[ZIO_TYPES];
 /*
  * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
  * identifies any block in the pool.  By convention, the meta-objset (MOS)
- * is objset 0, the meta-dnode is object 0, the root block (osphys_t) is
- * level -1 of the meta-dnode, and intent log blocks (which are chained
- * off the root block) have blkid == sequence number.  In summary:
+ * is objset 0, and the meta-dnode is object 0.  This covers all blocks
+ * except root blocks and ZIL blocks, which are defined as follows:
  *
- *     mos is objset 0
- *     meta-dnode is object 0
- *     root block is <objset, 0, -1, 0>
- *     intent log is <objset, 0, -1, ZIL sequence number>
+ * Root blocks (objset_phys_t) are object 0, level -1:  <objset, 0, -1, 0>.
+ * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
+ * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
  *
- * Note: this structure is called a bookmark because its first purpose was
- * to remember where to resume a pool-wide traverse.  The absolute ordering
- * for block visitation during traversal is defined in compare_bookmark().
+ * Note: this structure is called a bookmark because its original purpose
+ * was to remember where to resume a pool-wide traverse.
  *
  * Note: this structure is passed between userland and the kernel.
  * Therefore it must not change size or alignment between 32/64 bit
@@ -235,14 +252,66 @@ typedef struct zbookmark {
        uint64_t        zb_blkid;
 } zbookmark_t;
 
+#define        SET_BOOKMARK(zb, objset, object, level, blkid)  \
+{                                                       \
+       (zb)->zb_objset = objset;                       \
+       (zb)->zb_object = object;                       \
+       (zb)->zb_level = level;                         \
+       (zb)->zb_blkid = blkid;                         \
+}
+
+#define        ZB_DESTROYED_OBJSET     (-1ULL)
+
+#define        ZB_ROOT_OBJECT          (0ULL)
+#define        ZB_ROOT_LEVEL           (-1LL)
+#define        ZB_ROOT_BLKID           (0ULL)
+
+#define        ZB_ZIL_OBJECT           (0ULL)
+#define        ZB_ZIL_LEVEL            (-2LL)
+
 typedef struct zio_prop {
        enum zio_checksum       zp_checksum;
        enum zio_compress       zp_compress;
        dmu_object_type_t       zp_type;
        uint8_t                 zp_level;
-       uint8_t                 zp_ndvas;
+       uint8_t                 zp_copies;
+       uint8_t                 zp_dedup;
+       uint8_t                 zp_dedup_verify;
 } zio_prop_t;
 
+typedef struct zio_cksum_report zio_cksum_report_t;
+
+typedef void zio_cksum_finish_f(zio_cksum_report_t *rep,
+    const void *good_data);
+typedef void zio_cksum_free_f(void *cbdata, size_t size);
+
+struct zio_bad_cksum;                          /* defined in zio_checksum.h */
+
+struct zio_cksum_report {
+       struct zio_cksum_report *zcr_next;
+       nvlist_t                *zcr_ereport;
+       nvlist_t                *zcr_detector;
+       void                    *zcr_cbdata;
+       size_t                  zcr_cbinfo;     /* passed to zcr_free() */
+       uint64_t                zcr_align;
+       uint64_t                zcr_length;
+       zio_cksum_finish_f      *zcr_finish;
+       zio_cksum_free_f        *zcr_free;
+
+       /* internal use only */
+       struct zio_bad_cksum    *zcr_ckinfo;    /* information from failure */
+};
+
+typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr,
+    void *arg);
+
+zio_vsd_cksum_report_f zio_vsd_default_cksum_report;
+
+typedef struct zio_vsd_ops {
+       zio_done_func_t         *vsd_free;
+       zio_vsd_cksum_report_f  *vsd_cksum_report;
+} zio_vsd_ops_t;
+
 typedef struct zio_gang_node {
        zio_gbh_phys_t          *gn_gbh;
        struct zio_gang_node    *gn_child[SPA_GBH_NBLKPTRS];
@@ -293,6 +362,7 @@ struct zio {
        uint64_t        io_txg;
        spa_t           *io_spa;
        blkptr_t        *io_bp;
+       blkptr_t        *io_bp_override;
        blkptr_t        io_bp_copy;
        list_t          io_parent_list;
        list_t          io_child_list;
@@ -304,16 +374,20 @@ struct zio {
        zio_done_func_t *io_ready;
        zio_done_func_t *io_done;
        void            *io_private;
+       int64_t         io_prev_space_delta;    /* DMU private */
        blkptr_t        io_bp_orig;
 
        /* Data represented by this I/O */
        void            *io_data;
+       void            *io_orig_data;
        uint64_t        io_size;
+       uint64_t        io_orig_size;
 
        /* Stuff for the vdev stack */
        vdev_t          *io_vd;
        void            *io_vsd;
-       zio_done_func_t *io_vsd_free;
+       const zio_vsd_ops_t *io_vsd_ops;
+
        uint64_t        io_offset;
        uint64_t        io_deadline;
        avl_node_t      io_offset_node;
@@ -321,15 +395,17 @@ struct zio {
        avl_tree_t      *io_vdev_tree;
 
        /* Internal pipeline state */
-       int             io_flags;
-       zio_stage_t     io_stage;
-       uint32_t        io_pipeline;
-       int             io_orig_flags;
-       zio_stage_t     io_orig_stage;
-       uint32_t        io_orig_pipeline;
+       enum zio_flag   io_flags;
+       enum zio_stage  io_stage;
+       enum zio_stage  io_pipeline;
+       enum zio_flag   io_orig_flags;
+       enum zio_stage  io_orig_stage;
+       enum zio_stage  io_orig_pipeline;
        int             io_error;
        int             io_child_error[ZIO_CHILD_TYPES];
        uint64_t        io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
+       uint64_t        io_child_count;
+       uint64_t        io_parent_count;
        uint64_t        *io_stall;
        zio_t           *io_gang_leader;
        zio_gang_node_t *io_gang_tree;
@@ -339,53 +415,58 @@ struct zio {
        kcondvar_t      io_cv;
 
        /* FMA state */
+       zio_cksum_report_t *io_cksum_report;
        uint64_t        io_ena;
 };
 
 extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
-    zio_done_func_t *done, void *private, int flags);
+    zio_done_func_t *done, void *private, enum zio_flag flags);
 
 extern zio_t *zio_root(spa_t *spa,
-    zio_done_func_t *done, void *private, int flags);
+    zio_done_func_t *done, void *private, enum zio_flag flags);
 
 extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
     uint64_t size, zio_done_func_t *done, void *private,
-    int priority, int flags, const zbookmark_t *zb);
+    int priority, enum zio_flag flags, const zbookmark_t *zb);
 
 extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    void *data, uint64_t size, zio_prop_t *zp,
+    void *data, uint64_t size, const zio_prop_t *zp,
     zio_done_func_t *ready, zio_done_func_t *done, void *private,
-    int priority, int flags, const zbookmark_t *zb);
+    int priority, enum zio_flag flags, const zbookmark_t *zb);
 
 extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     void *data, uint64_t size, zio_done_func_t *done, void *private,
-    int priority, int flags, zbookmark_t *zb);
+    int priority, enum zio_flag flags, zbookmark_t *zb);
 
-extern void zio_skip_write(zio_t *zio);
+extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies);
 
-extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    zio_done_func_t *done, void *private, int flags);
+extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
 
-extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    zio_done_func_t *done, void *private, int flags);
+extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
+    const blkptr_t *bp,
+    zio_done_func_t *done, void *private, enum zio_flag flags);
 
 extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
-    zio_done_func_t *done, void *private, int priority, int flags);
+    zio_done_func_t *done, void *private, int priority, enum zio_flag flags);
 
 extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
     uint64_t size, void *data, int checksum,
-    zio_done_func_t *done, void *private, int priority, int flags,
+    zio_done_func_t *done, void *private, int priority, enum zio_flag flags,
     boolean_t labels);
 
 extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
     uint64_t size, void *data, int checksum,
-    zio_done_func_t *done, void *private, int priority, int flags,
+    zio_done_func_t *done, void *private, int priority, enum zio_flag flags,
     boolean_t labels);
 
-extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp,
-    blkptr_t *old_bp, uint64_t txg);
-extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg);
+extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
+    const blkptr_t *bp, enum zio_flag flags);
+
+extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
+    blkptr_t *old_bp, uint64_t size, boolean_t use_slog);
+extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
 extern void zio_flush(zio_t *zio, vdev_t *vd);
+extern void zio_shrink(zio_t *zio, uint64_t size);
 
 extern int zio_wait(zio_t *zio);
 extern void zio_nowait(zio_t *zio);
@@ -406,11 +487,11 @@ extern void zio_resubmit_stage_async(void *);
 
 extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
     uint64_t offset, void *data, uint64_t size, int type, int priority,
-    int flags, zio_done_func_t *done, void *private);
+    enum zio_flag flags, zio_done_func_t *done, void *private);
 
 extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
     void *data, uint64_t size, int type, int priority,
-    int flags, zio_done_func_t *done, void *private);
+    enum zio_flag flags, zio_done_func_t *done, void *private);
 
 extern void zio_vdev_io_bypass(zio_t *zio);
 extern void zio_vdev_io_reissue(zio_t *zio);
@@ -419,8 +500,12 @@ extern void zio_vdev_io_redone(zio_t *zio);
 extern void zio_checksum_verified(zio_t *zio);
 extern int zio_worst_error(int e1, int e2);
 
-extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent);
-extern uint8_t zio_compress_select(uint8_t child, uint8_t parent);
+extern enum zio_checksum zio_checksum_select(enum zio_checksum child,
+    enum zio_checksum parent);
+extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa,
+    enum zio_checksum child, enum zio_checksum parent);
+extern enum zio_compress zio_compress_select(enum zio_compress child,
+    enum zio_compress parent);
 
 extern void zio_suspend(spa_t *spa, zio_t *zio);
 extern int zio_resume(spa_t *spa);
@@ -442,9 +527,30 @@ extern int zio_inject_fault(char *name, int flags, int *id,
 extern int zio_inject_list_next(int *id, char *name, size_t buflen,
     struct zinject_record *record);
 extern int zio_clear_fault(int id);
+extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type);
 extern int zio_handle_fault_injection(zio_t *zio, int error);
 extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
 extern int zio_handle_label_injection(zio_t *zio, int error);
+extern void zio_handle_ignored_writes(zio_t *zio);
+
+/*
+ * Checksum ereport functions
+ */
+extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, struct zio *zio,
+    uint64_t offset, uint64_t length, void *arg, struct zio_bad_cksum *info);
+extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report,
+    const void *good_data, const void *bad_data, boolean_t drop_if_identical);
+
+extern void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report);
+extern void zfs_ereport_free_checksum(zio_cksum_report_t *report);
+
+/* If we have the good data in hand, this function can be used */
+extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
+    struct zio *zio, uint64_t offset, uint64_t length,
+    const void *good_data, const void *bad_data, struct zio_bad_cksum *info);
+
+/* Called from spa_sync(), but primarily an injection handler */
+extern void spa_handle_ignored_writes(spa_t *spa);
 
 #ifdef __cplusplus
 }
index da40739..0956c04 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_ZIO_CHECKSUM_H
@@ -43,28 +42,31 @@ typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
 typedef struct zio_checksum_info {
        zio_checksum_t  *ci_func[2]; /* checksum function for each byteorder */
        int             ci_correctable; /* number of correctable bits   */
-       int             ci_zbt;         /* uses zio block tail? */
+       int             ci_eck;         /* uses zio embedded checksum? */
+       int             ci_dedup;       /* strong enough for dedup? */
        char            *ci_name;       /* descriptive name */
 } zio_checksum_info_t;
 
+typedef struct zio_bad_cksum {
+       zio_cksum_t             zbc_expected;
+       zio_cksum_t             zbc_actual;
+       const char              *zbc_checksum_name;
+       uint8_t                 zbc_byteswapped;
+       uint8_t                 zbc_injected;
+       uint8_t                 zbc_has_cksum;  /* expected/actual valid */
+} zio_bad_cksum_t;
+
 extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS];
 
 /*
  * Checksum routines.
  */
-extern zio_checksum_t fletcher_2_native;
-extern zio_checksum_t fletcher_4_native;
-extern zio_checksum_t fletcher_4_incremental_native;
-
-extern zio_checksum_t fletcher_2_byteswap;
-extern zio_checksum_t fletcher_4_byteswap;
-extern zio_checksum_t fletcher_4_incremental_byteswap;
-
 extern zio_checksum_t zio_checksum_SHA256;
 
 extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
     void *data, uint64_t size);
-extern int zio_checksum_error(zio_t *zio);
+extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out);
+extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
 
 #ifdef __cplusplus
 }
index 66ee8d4..30bed1a 100644 (file)
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _SYS_ZIO_COMPRESS_H
 #define        _SYS_ZIO_COMPRESS_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/zio.h>
 
 #ifdef __cplusplus
@@ -66,14 +64,18 @@ extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len,
     int level);
 extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len,
     int level);
+extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len,
+    int level);
+extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len,
+    int level);
 
 /*
  * Compress and decompress data if necessary.
  */
-extern int zio_compress_data(int cpfunc, void *src, uint64_t srcsize,
-    void **destp, uint64_t *destsizep, uint64_t *destbufsizep);
-extern int zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
-    void *dest, uint64_t destsize);
+extern size_t zio_compress_data(enum zio_compress c, void *src, void *dst,
+    size_t s_len);
+extern int zio_decompress_data(enum zio_compress c, void *src, void *dst,
+    size_t s_len, size_t d_len);
 
 #ifdef __cplusplus
 }
index e7503b7..d90bd8b 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -34,104 +34,136 @@ extern "C" {
 #endif
 
 /*
- * I/O Groups: pipeline stage definitions.
+ * zio pipeline stage definitions
  */
-typedef enum zio_stage {
-       ZIO_STAGE_OPEN = 0,                     /* RWFCI */
+enum zio_stage {
+       ZIO_STAGE_OPEN                  = 1 << 0,       /* RWFCI */
 
-       ZIO_STAGE_ISSUE_ASYNC,                  /* -W--- */
+       ZIO_STAGE_READ_BP_INIT          = 1 << 1,       /* R---- */
+       ZIO_STAGE_FREE_BP_INIT          = 1 << 2,       /* --F-- */
+       ZIO_STAGE_ISSUE_ASYNC           = 1 << 3,       /* RWF-- */
+       ZIO_STAGE_WRITE_BP_INIT         = 1 << 4,       /* -W--- */
 
-       ZIO_STAGE_READ_BP_INIT,                 /* R---- */
-       ZIO_STAGE_WRITE_BP_INIT,                /* -W--- */
+       ZIO_STAGE_CHECKSUM_GENERATE     = 1 << 5,       /* -W--- */
 
-       ZIO_STAGE_CHECKSUM_GENERATE,            /* -W--- */
+       ZIO_STAGE_DDT_READ_START        = 1 << 6,       /* R---- */
+       ZIO_STAGE_DDT_READ_DONE         = 1 << 7,       /* R---- */
+       ZIO_STAGE_DDT_WRITE             = 1 << 8,       /* -W--- */
+       ZIO_STAGE_DDT_FREE              = 1 << 9,       /* --F-- */
 
-       ZIO_STAGE_GANG_ASSEMBLE,                /* RWFC- */
-       ZIO_STAGE_GANG_ISSUE,                   /* RWFC- */
+       ZIO_STAGE_GANG_ASSEMBLE         = 1 << 10,      /* RWFC- */
+       ZIO_STAGE_GANG_ISSUE            = 1 << 11,      /* RWFC- */
 
-       ZIO_STAGE_DVA_ALLOCATE,                 /* -W--- */
-       ZIO_STAGE_DVA_FREE,                     /* --F-- */
-       ZIO_STAGE_DVA_CLAIM,                    /* ---C- */
+       ZIO_STAGE_DVA_ALLOCATE          = 1 << 12,      /* -W--- */
+       ZIO_STAGE_DVA_FREE              = 1 << 13,      /* --F-- */
+       ZIO_STAGE_DVA_CLAIM             = 1 << 14,      /* ---C- */
 
-       ZIO_STAGE_READY,                        /* RWFCI */
+       ZIO_STAGE_READY                 = 1 << 15,      /* RWFCI */
 
-       ZIO_STAGE_VDEV_IO_START,                /* RW--I */
-       ZIO_STAGE_VDEV_IO_DONE,                 /* RW--I */
-       ZIO_STAGE_VDEV_IO_ASSESS,               /* RW--I */
+       ZIO_STAGE_VDEV_IO_START         = 1 << 16,      /* RW--I */
+       ZIO_STAGE_VDEV_IO_DONE          = 1 << 17,      /* RW--I */
+       ZIO_STAGE_VDEV_IO_ASSESS        = 1 << 18,      /* RW--I */
 
-       ZIO_STAGE_CHECKSUM_VERIFY,              /* R---- */
+       ZIO_STAGE_CHECKSUM_VERIFY       = 1 << 19,      /* R---- */
 
-       ZIO_STAGE_DONE,                         /* RWFCI */
-       ZIO_STAGES
-} zio_stage_t;
+       ZIO_STAGE_DONE                  = 1 << 20       /* RWFCI */
+};
 
-#define        ZIO_INTERLOCK_STAGES                                    \
-       ((1U << ZIO_STAGE_READY) |                              \
-       (1U << ZIO_STAGE_DONE))
+#define        ZIO_INTERLOCK_STAGES                    \
+       (ZIO_STAGE_READY |                      \
+       ZIO_STAGE_DONE)
 
-#define        ZIO_INTERLOCK_PIPELINE                                  \
+#define        ZIO_INTERLOCK_PIPELINE                  \
        ZIO_INTERLOCK_STAGES
 
-#define        ZIO_VDEV_IO_STAGES                                      \
-       ((1U << ZIO_STAGE_VDEV_IO_START) |                      \
-       (1U << ZIO_STAGE_VDEV_IO_DONE) |                        \
-       (1U << ZIO_STAGE_VDEV_IO_ASSESS))
+#define        ZIO_VDEV_IO_STAGES                      \
+       (ZIO_STAGE_VDEV_IO_START |              \
+       ZIO_STAGE_VDEV_IO_DONE |                \
+       ZIO_STAGE_VDEV_IO_ASSESS)
 
-#define        ZIO_VDEV_CHILD_PIPELINE                                 \
-       (ZIO_VDEV_IO_STAGES |                                   \
-       (1U << ZIO_STAGE_DONE))
+#define        ZIO_VDEV_CHILD_PIPELINE                 \
+       (ZIO_VDEV_IO_STAGES |                   \
+       ZIO_STAGE_DONE)
 
-#define        ZIO_READ_COMMON_STAGES                                  \
-       (ZIO_INTERLOCK_STAGES |                                 \
-       ZIO_VDEV_IO_STAGES |                                    \
-       (1U << ZIO_STAGE_CHECKSUM_VERIFY))
+#define        ZIO_READ_COMMON_STAGES                  \
+       (ZIO_INTERLOCK_STAGES |                 \
+       ZIO_VDEV_IO_STAGES |                    \
+       ZIO_STAGE_CHECKSUM_VERIFY)
 
-#define        ZIO_READ_PHYS_PIPELINE                                  \
+#define        ZIO_READ_PHYS_PIPELINE                  \
        ZIO_READ_COMMON_STAGES
 
-#define        ZIO_READ_PIPELINE                                       \
-       (ZIO_READ_COMMON_STAGES |                               \
-       (1U << ZIO_STAGE_READ_BP_INIT))
+#define        ZIO_READ_PIPELINE                       \
+       (ZIO_READ_COMMON_STAGES |               \
+       ZIO_STAGE_READ_BP_INIT)
 
-#define        ZIO_WRITE_COMMON_STAGES                                 \
-       (ZIO_INTERLOCK_STAGES |                                 \
-       ZIO_VDEV_IO_STAGES |                                    \
-       (1U << ZIO_STAGE_ISSUE_ASYNC) |                         \
-       (1U << ZIO_STAGE_CHECKSUM_GENERATE))
-
-#define        ZIO_WRITE_PHYS_PIPELINE                                 \
-       ZIO_WRITE_COMMON_STAGES
-
-#define        ZIO_REWRITE_PIPELINE                                    \
-       (ZIO_WRITE_COMMON_STAGES |                              \
-       (1U << ZIO_STAGE_WRITE_BP_INIT))
-
-#define        ZIO_WRITE_PIPELINE                                      \
-       (ZIO_WRITE_COMMON_STAGES |                              \
-       (1U << ZIO_STAGE_WRITE_BP_INIT) |                       \
-       (1U << ZIO_STAGE_DVA_ALLOCATE))
-
-#define        ZIO_GANG_STAGES                                         \
-       ((1U << ZIO_STAGE_GANG_ASSEMBLE) |                      \
-       (1U << ZIO_STAGE_GANG_ISSUE))
+#define        ZIO_DDT_CHILD_READ_PIPELINE             \
+       ZIO_READ_COMMON_STAGES
 
-#define        ZIO_FREE_PIPELINE                                       \
-       (ZIO_INTERLOCK_STAGES |                                 \
-       (1U << ZIO_STAGE_DVA_FREE))
+#define        ZIO_DDT_READ_PIPELINE                   \
+       (ZIO_INTERLOCK_STAGES |                 \
+       ZIO_STAGE_READ_BP_INIT |                \
+       ZIO_STAGE_DDT_READ_START |              \
+       ZIO_STAGE_DDT_READ_DONE)
 
-#define        ZIO_CLAIM_PIPELINE                                      \
-       (ZIO_INTERLOCK_STAGES |                                 \
-       (1U << ZIO_STAGE_DVA_CLAIM))
+#define        ZIO_WRITE_COMMON_STAGES                 \
+       (ZIO_INTERLOCK_STAGES |                 \
+       ZIO_VDEV_IO_STAGES |                    \
+       ZIO_STAGE_ISSUE_ASYNC |                 \
+       ZIO_STAGE_CHECKSUM_GENERATE)
 
-#define        ZIO_IOCTL_PIPELINE                                      \
-       (ZIO_INTERLOCK_STAGES |                                 \
-       (1U << ZIO_STAGE_VDEV_IO_START) |                       \
-       (1U << ZIO_STAGE_VDEV_IO_ASSESS))
+#define        ZIO_WRITE_PHYS_PIPELINE                 \
+       ZIO_WRITE_COMMON_STAGES
 
-#define        ZIO_CONFIG_LOCK_BLOCKING_STAGES                         \
-       ((1U << ZIO_STAGE_VDEV_IO_START) |                      \
-       (1U << ZIO_STAGE_DVA_ALLOCATE) |                        \
-       (1U << ZIO_STAGE_DVA_CLAIM))
+#define        ZIO_REWRITE_PIPELINE                    \
+       (ZIO_WRITE_COMMON_STAGES |              \
+       ZIO_STAGE_WRITE_BP_INIT)
+
+#define        ZIO_WRITE_PIPELINE                      \
+       (ZIO_WRITE_COMMON_STAGES |              \
+       ZIO_STAGE_WRITE_BP_INIT |               \
+       ZIO_STAGE_DVA_ALLOCATE)
+
+#define        ZIO_DDT_CHILD_WRITE_PIPELINE            \
+       (ZIO_INTERLOCK_STAGES |                 \
+       ZIO_VDEV_IO_STAGES |                    \
+       ZIO_STAGE_DVA_ALLOCATE)
+
+#define        ZIO_DDT_WRITE_PIPELINE                  \
+       (ZIO_INTERLOCK_STAGES |                 \
+       ZIO_STAGE_ISSUE_ASYNC |                 \
+       ZIO_STAGE_WRITE_BP_INIT |               \
+       ZIO_STAGE_CHECKSUM_GENERATE |           \
+       ZIO_STAGE_DDT_WRITE)
+
+#define        ZIO_GANG_STAGES                         \
+       (ZIO_STAGE_GANG_ASSEMBLE |              \
+       ZIO_STAGE_GANG_ISSUE)
+
+#define        ZIO_FREE_PIPELINE                       \
+       (ZIO_INTERLOCK_STAGES |                 \
+       ZIO_STAGE_FREE_BP_INIT |                \
+       ZIO_STAGE_DVA_FREE)
+
+#define        ZIO_DDT_FREE_PIPELINE                   \
+       (ZIO_INTERLOCK_STAGES |                 \
+       ZIO_STAGE_FREE_BP_INIT |                \
+       ZIO_STAGE_ISSUE_ASYNC |                 \
+       ZIO_STAGE_DDT_FREE)
+
+#define        ZIO_CLAIM_PIPELINE                      \
+       (ZIO_INTERLOCK_STAGES |                 \
+       ZIO_STAGE_DVA_CLAIM)
+
+#define        ZIO_IOCTL_PIPELINE                      \
+       (ZIO_INTERLOCK_STAGES |                 \
+       ZIO_STAGE_VDEV_IO_START |               \
+       ZIO_STAGE_VDEV_IO_ASSESS)
+
+#define        ZIO_BLOCKING_STAGES                     \
+       (ZIO_STAGE_DVA_ALLOCATE |               \
+       ZIO_STAGE_DVA_CLAIM |                   \
+       ZIO_STAGE_VDEV_IO_START)
 
 extern void zio_inject_init(void);
 extern void zio_inject_fini(void);
index 06adc66..0059bf5 100644 (file)
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_ZVOL_H
 #define        _SYS_ZVOL_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/zfs_context.h>
 
 #ifdef __cplusplus
@@ -43,10 +40,10 @@ extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize);
 extern int zvol_check_volblocksize(uint64_t volblocksize);
 extern int zvol_get_stats(objset_t *os, nvlist_t *nv);
 extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
-extern int zvol_create_minor(const char *, major_t);
+extern int zvol_create_minor(const char *);
 extern int zvol_remove_minor(const char *);
+extern void zvol_remove_minors(const char *);
 extern int zvol_set_volsize(const char *, major_t, uint64_t);
-extern int zvol_set_volblocksize(const char *, uint64_t);
 
 extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr);
 extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks);
@@ -61,6 +58,15 @@ extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr,
 extern int zvol_busy(void);
 extern void zvol_init(void);
 extern void zvol_fini(void);
+
+extern int zvol_get_volume_params(minor_t minor, uint64_t *blksize,
+    uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
+    void **rl_hdl, void **bonus_hdl);
+extern uint64_t zvol_get_volume_size(void *minor_hdl);
+extern int zvol_get_volume_wce(void *minor_hdl);
+extern void zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off,
+    ssize_t resid, boolean_t sync);
+
 #endif
 
 #ifdef __cplusplus
index 7fcde84..10952f4 100644 (file)
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 /*
  * We keep our own copy of this algorithm for 2 main reasons:
- *     1. If we didn't, anyone modifying common/os/compress.c would
+ *     1. If we didn't, anyone modifying common/os/compress.c would
  *         directly break our on disk format
- *     2. Our version of lzjb does not have a number of checks that the
+ *     2. Our version of lzjb does not have a number of checks that the
  *         common/os version needs and uses
+ *     3. We initialize the lempel to ensure deterministic results,
+ *        so that identical blocks can always be deduplicated.
  * In particular, we are adding the "feature" that compress() can
  * take a destination buffer size and return -1 if the data will not
  * compress to d_len or less.
@@ -43,7 +43,7 @@
 #define        MATCH_MIN       3
 #define        MATCH_MAX       ((1 << MATCH_BITS) + (MATCH_MIN - 1))
 #define        OFFSET_MASK     ((1 << (16 - MATCH_BITS)) - 1)
-#define        LEMPEL_SIZE     256
+#define        LEMPEL_SIZE     1024
 
 /*ARGSUSED*/
 size_t
@@ -53,20 +53,14 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
        uchar_t *dst = d_start;
        uchar_t *cpy, *copymap;
        int copymask = 1 << (NBBY - 1);
-       int mlen, offset;
+       int mlen, offset, hash;
        uint16_t *hp;
-       uint16_t lempel[LEMPEL_SIZE];   /* uninitialized; see above */
+       uint16_t lempel[LEMPEL_SIZE] = { 0 };
 
        while (src < (uchar_t *)s_start + s_len) {
                if ((copymask <<= 1) == (1 << NBBY)) {
-                       if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) {
-                               if (d_len != s_len)
-                                       return (s_len);
-                               mlen = s_len;
-                               for (src = s_start, dst = d_start; mlen; mlen--)
-                                       *dst++ = *src++;
+                       if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY)
                                return (s_len);
-                       }
                        copymask = 1;
                        copymap = dst;
                        *dst++ = 0;
@@ -75,8 +69,10 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
                        *dst++ = *src++;
                        continue;
                }
-               hp = &lempel[((src[0] + 13) ^ (src[1] - 13) ^ src[2]) &
-                   (LEMPEL_SIZE - 1)];
+               hash = (src[0] << 16) + (src[1] << 8) + src[2];
+               hash += hash >> 9;
+               hash += hash >> 5;
+               hp = &lempel[hash & (LEMPEL_SIZE - 1)];
                offset = (intptr_t)(src - *hp) & OFFSET_MASK;
                *hp = (uint16_t)(uintptr_t)src;
                cpy = src - offset;
index 77556ac..17b4b12 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
-#include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/space_map.h>
@@ -36,8 +34,13 @@ uint64_t metaslab_aliquot = 512ULL << 10;
 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;    /* force gang blocks */
 
 /*
+ * Metaslab debugging: when set, keeps all space maps in core to verify frees.
+ */
+static int metaslab_debug = 0;
+
+/*
  * Minimum size which forces the dynamic allocator to change
- * it's allocation strategy. Once the space map cannot satisfy
+ * its allocation strategy.  Once the space map cannot satisfy
  * an allocation of this size then it switches to using more
  * aggressive strategy (i.e search by size rather than offset).
  */
@@ -49,7 +52,23 @@ uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
  * Once the space_map's free space drops below this level we dynamically
  * switch to using best-fit allocations.
  */
-int metaslab_df_free_pct = 30;
+int metaslab_df_free_pct = 4;
+
+/*
+ * A metaslab is considered "free" if it contains a contiguous
+ * segment which is greater than metaslab_min_alloc_size.
+ */
+uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
+
+/*
+ * Max number of space_maps to prefetch.
+ */
+int metaslab_prefetch_limit = SPA_DVAS_PER_BP;
+
+/*
+ * Percentage bonus multiplier for metaslabs that are in the bonus area.
+ */
+int metaslab_smo_bonus_pct = 150;
 
 /*
  * ==========================================================================
@@ -57,12 +76,13 @@ int metaslab_df_free_pct = 30;
  * ==========================================================================
  */
 metaslab_class_t *
-metaslab_class_create(space_map_ops_t *ops)
+metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
 {
        metaslab_class_t *mc;
 
        mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
 
+       mc->mc_spa = spa;
        mc->mc_rotor = NULL;
        mc->mc_ops = ops;
 
@@ -72,58 +92,73 @@ metaslab_class_create(space_map_ops_t *ops)
 void
 metaslab_class_destroy(metaslab_class_t *mc)
 {
-       metaslab_group_t *mg;
-
-       while ((mg = mc->mc_rotor) != NULL) {
-               metaslab_class_remove(mc, mg);
-               metaslab_group_destroy(mg);
-       }
+       ASSERT(mc->mc_rotor == NULL);
+       ASSERT(mc->mc_alloc == 0);
+       ASSERT(mc->mc_deferred == 0);
+       ASSERT(mc->mc_space == 0);
+       ASSERT(mc->mc_dspace == 0);
 
        kmem_free(mc, sizeof (metaslab_class_t));
 }
 
-void
-metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
+int
+metaslab_class_validate(metaslab_class_t *mc)
 {
-       metaslab_group_t *mgprev, *mgnext;
+       metaslab_group_t *mg;
+       vdev_t *vd;
 
-       ASSERT(mg->mg_class == NULL);
+       /*
+        * Must hold one of the spa_config locks.
+        */
+       ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
+           spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
 
-       if ((mgprev = mc->mc_rotor) == NULL) {
-               mg->mg_prev = mg;
-               mg->mg_next = mg;
-       } else {
-               mgnext = mgprev->mg_next;
-               mg->mg_prev = mgprev;
-               mg->mg_next = mgnext;
-               mgprev->mg_next = mg;
-               mgnext->mg_prev = mg;
-       }
-       mc->mc_rotor = mg;
-       mg->mg_class = mc;
+       if ((mg = mc->mc_rotor) == NULL)
+               return (0);
+
+       do {
+               vd = mg->mg_vd;
+               ASSERT(vd->vdev_mg != NULL);
+               ASSERT3P(vd->vdev_top, ==, vd);
+               ASSERT3P(mg->mg_class, ==, mc);
+               ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
+       } while ((mg = mg->mg_next) != mc->mc_rotor);
+
+       return (0);
 }
 
 void
-metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
+metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
+    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
 {
-       metaslab_group_t *mgprev, *mgnext;
+       atomic_add_64(&mc->mc_alloc, alloc_delta);
+       atomic_add_64(&mc->mc_deferred, defer_delta);
+       atomic_add_64(&mc->mc_space, space_delta);
+       atomic_add_64(&mc->mc_dspace, dspace_delta);
+}
 
-       ASSERT(mg->mg_class == mc);
+uint64_t
+metaslab_class_get_alloc(metaslab_class_t *mc)
+{
+       return (mc->mc_alloc);
+}
 
-       mgprev = mg->mg_prev;
-       mgnext = mg->mg_next;
+uint64_t
+metaslab_class_get_deferred(metaslab_class_t *mc)
+{
+       return (mc->mc_deferred);
+}
 
-       if (mg == mgnext) {
-               mc->mc_rotor = NULL;
-       } else {
-               mc->mc_rotor = mgnext;
-               mgprev->mg_next = mgnext;
-               mgnext->mg_prev = mgprev;
-       }
+uint64_t
+metaslab_class_get_space(metaslab_class_t *mc)
+{
+       return (mc->mc_space);
+}
 
-       mg->mg_prev = NULL;
-       mg->mg_next = NULL;
-       mg->mg_class = NULL;
+uint64_t
+metaslab_class_get_dspace(metaslab_class_t *mc)
+{
+       return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
 }
 
 /*
@@ -164,9 +199,9 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
        mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
        avl_create(&mg->mg_metaslab_tree, metaslab_compare,
            sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
-       mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children);
        mg->mg_vd = vd;
-       metaslab_class_add(mc, mg);
+       mg->mg_class = mc;
+       mg->mg_activation_count = 0;
 
        return (mg);
 }
@@ -174,11 +209,82 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
 void
 metaslab_group_destroy(metaslab_group_t *mg)
 {
+       ASSERT(mg->mg_prev == NULL);
+       ASSERT(mg->mg_next == NULL);
+       /*
+        * We may have gone below zero with the activation count
+        * either because we never activated in the first place or
+        * because we're done, and possibly removing the vdev.
+        */
+       ASSERT(mg->mg_activation_count <= 0);
+
        avl_destroy(&mg->mg_metaslab_tree);
        mutex_destroy(&mg->mg_lock);
        kmem_free(mg, sizeof (metaslab_group_t));
 }
 
+void
+metaslab_group_activate(metaslab_group_t *mg)
+{
+       metaslab_class_t *mc = mg->mg_class;
+       metaslab_group_t *mgprev, *mgnext;
+
+       ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
+
+       ASSERT(mc->mc_rotor != mg);
+       ASSERT(mg->mg_prev == NULL);
+       ASSERT(mg->mg_next == NULL);
+       ASSERT(mg->mg_activation_count <= 0);
+
+       if (++mg->mg_activation_count <= 0)
+               return;
+
+       mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
+
+       if ((mgprev = mc->mc_rotor) == NULL) {
+               mg->mg_prev = mg;
+               mg->mg_next = mg;
+       } else {
+               mgnext = mgprev->mg_next;
+               mg->mg_prev = mgprev;
+               mg->mg_next = mgnext;
+               mgprev->mg_next = mg;
+               mgnext->mg_prev = mg;
+       }
+       mc->mc_rotor = mg;
+}
+
+void
+metaslab_group_passivate(metaslab_group_t *mg)
+{
+       metaslab_class_t *mc = mg->mg_class;
+       metaslab_group_t *mgprev, *mgnext;
+
+       ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
+
+       if (--mg->mg_activation_count != 0) {
+               ASSERT(mc->mc_rotor != mg);
+               ASSERT(mg->mg_prev == NULL);
+               ASSERT(mg->mg_next == NULL);
+               ASSERT(mg->mg_activation_count < 0);
+               return;
+       }
+
+       mgprev = mg->mg_prev;
+       mgnext = mg->mg_next;
+
+       if (mg == mgnext) {
+               mc->mc_rotor = NULL;
+       } else {
+               mc->mc_rotor = mgnext;
+               mgprev->mg_next = mgnext;
+               mgnext->mg_prev = mgprev;
+       }
+
+       mg->mg_prev = NULL;
+       mg->mg_next = NULL;
+}
+
 static void
 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
 {
@@ -219,6 +325,32 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 }
 
 /*
+ * ==========================================================================
+ * Common allocator routines
+ * ==========================================================================
+ */
+static int
+metaslab_segsize_compare(const void *x1, const void *x2)
+{
+       const space_seg_t *s1 = x1;
+       const space_seg_t *s2 = x2;
+       uint64_t ss_size1 = s1->ss_end - s1->ss_start;
+       uint64_t ss_size2 = s2->ss_end - s2->ss_start;
+
+       if (ss_size1 < ss_size2)
+               return (-1);
+       if (ss_size1 > ss_size2)
+               return (1);
+
+       if (s1->ss_start < s2->ss_start)
+               return (-1);
+       if (s1->ss_start > s2->ss_start)
+               return (1);
+
+       return (0);
+}
+
+/*
  * This is a helper function that can be used by the allocator to find
  * a suitable block to allocate. This will search the specified AVL
  * tree looking for a block that matches the specified criteria.
@@ -258,68 +390,58 @@ metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
        return (metaslab_block_picker(t, cursor, size, align));
 }
 
-/*
- * ==========================================================================
- * The first-fit block allocator
- * ==========================================================================
- */
 static void
-metaslab_ff_load(space_map_t *sm)
+metaslab_pp_load(space_map_t *sm)
 {
+       space_seg_t *ss;
+
        ASSERT(sm->sm_ppd == NULL);
        sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
-       sm->sm_pp_root = NULL;
+
+       sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
+       avl_create(sm->sm_pp_root, metaslab_segsize_compare,
+           sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
+
+       for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
+               avl_add(sm->sm_pp_root, ss);
 }
 
 static void
-metaslab_ff_unload(space_map_t *sm)
+metaslab_pp_unload(space_map_t *sm)
 {
+       void *cookie = NULL;
+
        kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
        sm->sm_ppd = NULL;
-}
 
-static uint64_t
-metaslab_ff_alloc(space_map_t *sm, uint64_t size)
-{
-       avl_tree_t *t = &sm->sm_root;
-       uint64_t align = size & -size;
-       uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+       while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
+               /* tear down the tree */
+       }
 
-       return (metaslab_block_picker(t, cursor, size, align));
+       avl_destroy(sm->sm_pp_root);
+       kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
+       sm->sm_pp_root = NULL;
 }
 
 /* ARGSUSED */
 static void
-metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size)
+metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
 {
        /* No need to update cursor */
 }
 
 /* ARGSUSED */
 static void
-metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size)
+metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
 {
        /* No need to update cursor */
 }
 
-static space_map_ops_t metaslab_ff_ops = {
-       metaslab_ff_load,
-       metaslab_ff_unload,
-       metaslab_ff_alloc,
-       metaslab_ff_claim,
-       metaslab_ff_free,
-       NULL    /* maxsize */
-};
-
 /*
- * Dynamic block allocator -
- * Uses the first fit allocation scheme until space get low and then
- * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
- * and metaslab_df_free_pct to determine when to switch the allocation scheme.
+ * Return the maximum contiguous segment within the metaslab.
  */
-
 uint64_t
-metaslab_df_maxsize(space_map_t *sm)
+metaslab_pp_maxsize(space_map_t *sm)
 {
        avl_tree_t *t = sm->sm_pp_root;
        space_seg_t *ss;
@@ -330,67 +452,53 @@ metaslab_df_maxsize(space_map_t *sm)
        return (ss->ss_end - ss->ss_start);
 }
 
-static int
-metaslab_df_seg_compare(const void *x1, const void *x2)
+/*
+ * ==========================================================================
+ * The first-fit block allocator
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_ff_alloc(space_map_t *sm, uint64_t size)
 {
-       const space_seg_t *s1 = x1;
-       const space_seg_t *s2 = x2;
-       uint64_t ss_size1 = s1->ss_end - s1->ss_start;
-       uint64_t ss_size2 = s2->ss_end - s2->ss_start;
-
-       if (ss_size1 < ss_size2)
-               return (-1);
-       if (ss_size1 > ss_size2)
-               return (1);
-
-       if (s1->ss_start < s2->ss_start)
-               return (-1);
-       if (s1->ss_start > s2->ss_start)
-               return (1);
+       avl_tree_t *t = &sm->sm_root;
+       uint64_t align = size & -size;
+       uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
 
-       return (0);
+       return (metaslab_block_picker(t, cursor, size, align));
 }
 
-static void
-metaslab_df_load(space_map_t *sm)
+/* ARGSUSED */
+boolean_t
+metaslab_ff_fragmented(space_map_t *sm)
 {
-       space_seg_t *ss;
-
-       ASSERT(sm->sm_ppd == NULL);
-       sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
-
-       sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
-       avl_create(sm->sm_pp_root, metaslab_df_seg_compare,
-           sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
-
-       for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
-               avl_add(sm->sm_pp_root, ss);
+       return (B_TRUE);
 }
 
-static void
-metaslab_df_unload(space_map_t *sm)
-{
-       void *cookie = NULL;
-
-       kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
-       sm->sm_ppd = NULL;
-
-       while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
-               /* tear down the tree */
-       }
-
-       avl_destroy(sm->sm_pp_root);
-       kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
-       sm->sm_pp_root = NULL;
-}
+static space_map_ops_t metaslab_ff_ops = {
+       metaslab_pp_load,
+       metaslab_pp_unload,
+       metaslab_ff_alloc,
+       metaslab_pp_claim,
+       metaslab_pp_free,
+       metaslab_pp_maxsize,
+       metaslab_ff_fragmented
+};
 
+/*
+ * ==========================================================================
+ * Dynamic block allocator -
+ * Uses the first fit allocation scheme until space get low and then
+ * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
+ * and metaslab_df_free_pct to determine when to switch the allocation scheme.
+ * ==========================================================================
+ */
 static uint64_t
 metaslab_df_alloc(space_map_t *sm, uint64_t size)
 {
        avl_tree_t *t = &sm->sm_root;
        uint64_t align = size & -size;
        uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
-       uint64_t max_size = metaslab_df_maxsize(sm);
+       uint64_t max_size = metaslab_pp_maxsize(sm);
        int free_pct = sm->sm_space * 100 / sm->sm_size;
 
        ASSERT(MUTEX_HELD(sm->sm_lock));
@@ -412,30 +520,158 @@ metaslab_df_alloc(space_map_t *sm, uint64_t size)
        return (metaslab_block_picker(t, cursor, size, 1ULL));
 }
 
-/* ARGSUSED */
-static void
-metaslab_df_claim(space_map_t *sm, uint64_t start, uint64_t size)
+static boolean_t
+metaslab_df_fragmented(space_map_t *sm)
 {
-       /* No need to update cursor */
-}
+       uint64_t max_size = metaslab_pp_maxsize(sm);
+       int free_pct = sm->sm_space * 100 / sm->sm_size;
 
-/* ARGSUSED */
-static void
-metaslab_df_free(space_map_t *sm, uint64_t start, uint64_t size)
-{
-       /* No need to update cursor */
+       if (max_size >= metaslab_df_alloc_threshold &&
+           free_pct >= metaslab_df_free_pct)
+               return (B_FALSE);
+
+       return (B_TRUE);
 }
 
 static space_map_ops_t metaslab_df_ops = {
-       metaslab_df_load,
-       metaslab_df_unload,
+       metaslab_pp_load,
+       metaslab_pp_unload,
        metaslab_df_alloc,
-       metaslab_df_claim,
-       metaslab_df_free,
-       metaslab_df_maxsize
+       metaslab_pp_claim,
+       metaslab_pp_free,
+       metaslab_pp_maxsize,
+       metaslab_df_fragmented
 };
 
-space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
+/*
+ * ==========================================================================
+ * Other experimental allocators
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
+{
+       avl_tree_t *t = &sm->sm_root;
+       uint64_t *cursor = (uint64_t *)sm->sm_ppd;
+       uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
+       uint64_t max_size = metaslab_pp_maxsize(sm);
+       uint64_t rsize = size;
+       uint64_t offset = 0;
+
+       ASSERT(MUTEX_HELD(sm->sm_lock));
+       ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+
+       if (max_size < size)
+               return (-1ULL);
+
+       ASSERT3U(*extent_end, >=, *cursor);
+
+       /*
+        * If we're running low on space switch to using the size
+        * sorted AVL tree (best-fit).
+        */
+       if ((*cursor + size) > *extent_end) {
+
+               t = sm->sm_pp_root;
+               *cursor = *extent_end = 0;
+
+               if (max_size > 2 * SPA_MAXBLOCKSIZE)
+                       rsize = MIN(metaslab_min_alloc_size, max_size);
+               offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
+               if (offset != -1)
+                       *cursor = offset + size;
+       } else {
+               offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
+       }
+       ASSERT3U(*cursor, <=, *extent_end);
+       return (offset);
+}
+
+static boolean_t
+metaslab_cdf_fragmented(space_map_t *sm)
+{
+       uint64_t max_size = metaslab_pp_maxsize(sm);
+
+       if (max_size > (metaslab_min_alloc_size * 10))
+               return (B_FALSE);
+       return (B_TRUE);
+}
+
+static space_map_ops_t metaslab_cdf_ops = {
+       metaslab_pp_load,
+       metaslab_pp_unload,
+       metaslab_cdf_alloc,
+       metaslab_pp_claim,
+       metaslab_pp_free,
+       metaslab_pp_maxsize,
+       metaslab_cdf_fragmented
+};
+
+uint64_t metaslab_ndf_clump_shift = 4;
+
+static uint64_t
+metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
+{
+       avl_tree_t *t = &sm->sm_root;
+       avl_index_t where;
+       space_seg_t *ss, ssearch;
+       uint64_t hbit = highbit(size);
+       uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
+       uint64_t max_size = metaslab_pp_maxsize(sm);
+
+       ASSERT(MUTEX_HELD(sm->sm_lock));
+       ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+
+       if (max_size < size)
+               return (-1ULL);
+
+       ssearch.ss_start = *cursor;
+       ssearch.ss_end = *cursor + size;
+
+       ss = avl_find(t, &ssearch, &where);
+       if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
+               t = sm->sm_pp_root;
+
+               ssearch.ss_start = 0;
+               ssearch.ss_end = MIN(max_size,
+                   1ULL << (hbit + metaslab_ndf_clump_shift));
+               ss = avl_find(t, &ssearch, &where);
+               if (ss == NULL)
+                       ss = avl_nearest(t, where, AVL_AFTER);
+               ASSERT(ss != NULL);
+       }
+
+       if (ss != NULL) {
+               if (ss->ss_start + size <= ss->ss_end) {
+                       *cursor = ss->ss_start + size;
+                       return (ss->ss_start);
+               }
+       }
+       return (-1ULL);
+}
+
+static boolean_t
+metaslab_ndf_fragmented(space_map_t *sm)
+{
+       uint64_t max_size = metaslab_pp_maxsize(sm);
+
+       if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
+               return (B_FALSE);
+       return (B_TRUE);
+}
+
+
+static space_map_ops_t metaslab_ndf_ops = {
+       metaslab_pp_load,
+       metaslab_pp_unload,
+       metaslab_ndf_alloc,
+       metaslab_pp_claim,
+       metaslab_pp_free,
+       metaslab_pp_maxsize,
+       metaslab_ndf_fragmented
+};
+
+space_map_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
 
 /*
  * ==========================================================================
@@ -466,6 +702,13 @@ metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
 
        metaslab_group_add(mg, msp);
 
+       if (metaslab_debug && smo->smo_object != 0) {
+               mutex_enter(&msp->ms_lock);
+               VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops,
+                   SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
+               mutex_exit(&msp->ms_lock);
+       }
+
        /*
         * If we're opening an existing pool (txg == 0) or creating
         * a new one (txg == TXG_INITIAL), all space is available now.
@@ -476,16 +719,8 @@ metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
                metaslab_sync_done(msp, 0);
 
        if (txg != 0) {
-               /*
-                * The vdev is dirty, but the metaslab isn't -- it just needs
-                * to have metaslab_sync_done() invoked from vdev_sync_done().
-                * [We could just dirty the metaslab, but that would cause us
-                * to allocate a space map object for it, which is wasteful
-                * and would mess up the locality logic in metaslab_weight().]
-                */
-               ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa));
                vdev_dirty(vd, 0, NULL, txg);
-               vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg));
+               vdev_dirty(vd, VDD_METASLAB, msp, txg);
        }
 
        return (msp);
@@ -495,10 +730,9 @@ void
 metaslab_fini(metaslab_t *msp)
 {
        metaslab_group_t *mg = msp->ms_group;
-       int t;
 
-       vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
-           -msp->ms_smo.smo_alloc, B_TRUE);
+       vdev_space_update(mg->mg_vd,
+           -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);
 
        metaslab_group_remove(mg, msp);
 
@@ -507,11 +741,16 @@ metaslab_fini(metaslab_t *msp)
        space_map_unload(&msp->ms_map);
        space_map_destroy(&msp->ms_map);
 
-       for (t = 0; t < TXG_SIZE; t++) {
+       for (int t = 0; t < TXG_SIZE; t++) {
                space_map_destroy(&msp->ms_allocmap[t]);
                space_map_destroy(&msp->ms_freemap[t]);
        }
 
+       for (int t = 0; t < TXG_DEFER_SIZE; t++)
+               space_map_destroy(&msp->ms_defermap[t]);
+
+       ASSERT3S(msp->ms_deferspace, ==, 0);
+
        mutex_exit(&msp->ms_lock);
        mutex_destroy(&msp->ms_lock);
 
@@ -522,7 +761,6 @@ metaslab_fini(metaslab_t *msp)
 #define        METASLAB_WEIGHT_SECONDARY       (1ULL << 62)
 #define        METASLAB_ACTIVE_MASK            \
        (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
-#define        METASLAB_SMO_BONUS_MULTIPLIER   2
 
 static uint64_t
 metaslab_weight(metaslab_t *msp)
@@ -555,36 +793,88 @@ metaslab_weight(metaslab_t *msp)
        ASSERT(weight >= space && weight <= 2 * space);
 
        /*
-        * For locality, assign higher weight to metaslabs we've used before.
+        * For locality, assign higher weight to metaslabs which have
+        * a lower offset than what we've already activated.
         */
-       if (smo->smo_object != 0)
-               weight *= METASLAB_SMO_BONUS_MULTIPLIER;
+       if (sm->sm_start <= mg->mg_bonus_area)
+               weight *= (metaslab_smo_bonus_pct / 100);
        ASSERT(weight >= space &&
-           weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space);
+           weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);
+
+       if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
+               /*
+                * If this metaslab is one we're actively using, adjust its
+                * weight to make it preferable to any inactive metaslab so
+                * we'll polish it off.
+                */
+               weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
+       }
+       return (weight);
+}
+
+static void
+metaslab_prefetch(metaslab_group_t *mg)
+{
+       spa_t *spa = mg->mg_vd->vdev_spa;
+       metaslab_t *msp;
+       avl_tree_t *t = &mg->mg_metaslab_tree;
+       int m;
+
+       mutex_enter(&mg->mg_lock);
 
        /*
-        * If this metaslab is one we're actively using, adjust its weight to
-        * make it preferable to any inactive metaslab so we'll polish it off.
+        * Prefetch the next potential metaslabs
         */
-       weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
+       for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
+               space_map_t *sm = &msp->ms_map;
+               space_map_obj_t *smo = &msp->ms_smo;
 
-       return (weight);
+               /* If we have reached our prefetch limit then we're done */
+               if (m >= metaslab_prefetch_limit)
+                       break;
+
+               if (!sm->sm_loaded && smo->smo_object != 0) {
+                       mutex_exit(&mg->mg_lock);
+                       dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
+                           0ULL, smo->smo_objsize);
+                       mutex_enter(&mg->mg_lock);
+               }
+       }
+       mutex_exit(&mg->mg_lock);
 }
 
 static int
 metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
 {
+       metaslab_group_t *mg = msp->ms_group;
        space_map_t *sm = &msp->ms_map;
        space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
 
        ASSERT(MUTEX_HELD(&msp->ms_lock));
 
        if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
-               int error = space_map_load(sm, sm_ops, SM_FREE, &msp->ms_smo,
-                   msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
-               if (error) {
-                       metaslab_group_sort(msp->ms_group, msp, 0);
-                       return (error);
+               space_map_load_wait(sm);
+               if (!sm->sm_loaded) {
+                       int error = space_map_load(sm, sm_ops, SM_FREE,
+                           &msp->ms_smo,
+                           spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
+                       if (error)  {
+                               metaslab_group_sort(msp->ms_group, msp, 0);
+                               return (error);
+                       }
+                       for (int t = 0; t < TXG_DEFER_SIZE; t++)
+                               space_map_walk(&msp->ms_defermap[t],
+                                   space_map_claim, sm);
+
+               }
+
+               /*
+                * Track the bonus area as we activate new metaslabs.
+                */
+               if (sm->sm_start > mg->mg_bonus_area) {
+                       mutex_enter(&mg->mg_lock);
+                       mg->mg_bonus_area = sm->sm_start;
+                       mutex_exit(&mg->mg_lock);
                }
 
                /*
@@ -624,7 +914,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 {
        vdev_t *vd = msp->ms_group->mg_vd;
        spa_t *spa = vd->vdev_spa;
-       objset_t *mos = spa->spa_meta_objset;
+       objset_t *mos = spa_meta_objset(spa);
        space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
        space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
        space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
@@ -632,9 +922,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
        space_map_obj_t *smo = &msp->ms_smo_syncing;
        dmu_buf_t *db;
        dmu_tx_t *tx;
-       int t;
 
-       tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+       ASSERT(!vd->vdev_ishole);
+
+       if (allocmap->sm_space == 0 && freemap->sm_space == 0)
+               return;
 
        /*
         * The only state that can actually be changing concurrently with
@@ -644,12 +936,12 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
         * We drop it whenever we call into the DMU, because the DMU
         * can call down to us (e.g. via zio_free()) at any time.
         */
-       mutex_enter(&msp->ms_lock);
+
+       tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
 
        if (smo->smo_object == 0) {
                ASSERT(smo->smo_objsize == 0);
                ASSERT(smo->smo_alloc == 0);
-               mutex_exit(&msp->ms_lock);
                smo->smo_object = dmu_object_alloc(mos,
                    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
                    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
@@ -657,9 +949,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
                dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
                    (sm->sm_start >> vd->vdev_ms_shift),
                    sizeof (uint64_t), &smo->smo_object, tx);
-               mutex_enter(&msp->ms_lock);
        }
 
+       mutex_enter(&msp->ms_lock);
+
        space_map_walk(freemap, space_map_add, freed_map);
 
        if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
@@ -672,6 +965,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
                 * This metaslab is 100% allocated,
                 * minus the content of the in-core map (sm),
                 * minus what's been freed this txg (freed_map),
+                * minus deferred frees (ms_defermap[]),
                 * minus allocations from txgs in the future
                 * (because they haven't been committed yet).
                 */
@@ -683,7 +977,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
                space_map_walk(sm, space_map_remove, allocmap);
                space_map_walk(freed_map, space_map_remove, allocmap);
 
-               for (t = 1; t < TXG_CONCURRENT_STATES; t++)
+               for (int t = 0; t < TXG_DEFER_SIZE; t++)
+                       space_map_walk(&msp->ms_defermap[t],
+                           space_map_remove, allocmap);
+
+               for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
                        space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
                            space_map_remove, allocmap);
 
@@ -717,9 +1015,12 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
        space_map_obj_t *smosync = &msp->ms_smo_syncing;
        space_map_t *sm = &msp->ms_map;
        space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
+       space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
        metaslab_group_t *mg = msp->ms_group;
        vdev_t *vd = mg->mg_vd;
-       int t;
+       int64_t alloc_delta, defer_delta;
+
+       ASSERT(!vd->vdev_ishole);
 
        mutex_enter(&msp->ms_lock);
 
@@ -728,16 +1029,24 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
         * allocmaps and freemaps and add its capacity to the vdev.
         */
        if (freed_map->sm_size == 0) {
-               for (t = 0; t < TXG_SIZE; t++) {
+               for (int t = 0; t < TXG_SIZE; t++) {
                        space_map_create(&msp->ms_allocmap[t], sm->sm_start,
                            sm->sm_size, sm->sm_shift, sm->sm_lock);
                        space_map_create(&msp->ms_freemap[t], sm->sm_start,
                            sm->sm_size, sm->sm_shift, sm->sm_lock);
                }
-               vdev_space_update(vd, sm->sm_size, 0, B_TRUE);
+
+               for (int t = 0; t < TXG_DEFER_SIZE; t++)
+                       space_map_create(&msp->ms_defermap[t], sm->sm_start,
+                           sm->sm_size, sm->sm_shift, sm->sm_lock);
+
+               vdev_space_update(vd, 0, 0, sm->sm_size);
        }
 
-       vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE);
+       alloc_delta = smosync->smo_alloc - smo->smo_alloc;
+       defer_delta = freed_map->sm_space - defer_map->sm_space;
+
+       vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
 
        ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
        ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
@@ -745,13 +1054,26 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
        /*
         * If there's a space_map_load() in progress, wait for it to complete
         * so that we have a consistent view of the in-core space map.
-        * Then, add everything we freed in this txg to the map.
+        * Then, add defer_map (oldest deferred frees) to this map and
+        * transfer freed_map (this txg's frees) to defer_map.
         */
        space_map_load_wait(sm);
-       space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm);
+       space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
+       space_map_vacate(freed_map, space_map_add, defer_map);
 
        *smo = *smosync;
 
+       msp->ms_deferspace += defer_delta;
+       ASSERT3S(msp->ms_deferspace, >=, 0);
+       ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
+       if (msp->ms_deferspace != 0) {
+               /*
+                * Keep syncing this metaslab until all deferred frees
+                * are back in circulation.
+                */
+               vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
+       }
+
        /*
         * If the map is loaded but no longer active, evict it as soon as all
         * future allocations have synced.  (If we unloaded it now and then
@@ -760,11 +1082,11 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
        if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
                int evictable = 1;
 
-               for (t = 1; t < TXG_CONCURRENT_STATES; t++)
+               for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
                        if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
                                evictable = 0;
 
-               if (evictable)
+               if (evictable && !metaslab_debug)
                        space_map_unload(sm);
        }
 
@@ -773,6 +1095,32 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
        mutex_exit(&msp->ms_lock);
 }
 
+void
+metaslab_sync_reassess(metaslab_group_t *mg)
+{
+       vdev_t *vd = mg->mg_vd;
+
+       /*
+        * Re-evaluate all metaslabs which have lower offsets than the
+        * bonus area.
+        */
+       for (int m = 0; m < vd->vdev_ms_count; m++) {
+               metaslab_t *msp = vd->vdev_ms[m];
+
+               if (msp->ms_map.sm_start > mg->mg_bonus_area)
+                       break;
+
+               mutex_enter(&msp->ms_lock);
+               metaslab_group_sort(mg, msp, metaslab_weight(msp));
+               mutex_exit(&msp->ms_lock);
+       }
+
+       /*
+        * Prefetch the next potential metaslabs
+        */
+       metaslab_prefetch(mg);
+}
+
 static uint64_t
 metaslab_distance(metaslab_t *msp, dva_t *dva)
 {
@@ -868,7 +1216,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
                if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
                        break;
 
-               metaslab_passivate(msp, size - 1);
+               metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
 
                mutex_exit(&msp->ms_lock);
        }
@@ -905,12 +1253,12 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
        /*
         * For testing, make some blocks above a certain size be gang blocks.
         */
-       if (psize >= metaslab_gang_bang && (lbolt & 3) == 0)
+       if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
                return (ENOSPC);
 
        /*
         * Start at the rotor and loop through all mgs until we find something.
-        * Note that there's no locking on mc_rotor or mc_allocated because
+        * Note that there's no locking on mc_rotor or mc_aliquot because
         * nothing actually breaks if we miss a few updates -- we just won't
         * allocate quite as evenly.  It all balances out over time.
         *
@@ -932,10 +1280,21 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
         */
        if (hintdva) {
                vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
-               if (flags & METASLAB_HINTBP_AVOID)
-                       mg = vd->vdev_mg->mg_next;
-               else
+
+               /*
+                * It's possible the vdev we're using as the hint no
+                * longer exists (i.e. removed). Consult the rotor when
+                * all else fails.
+                */
+               if (vd != NULL) {
                        mg = vd->vdev_mg;
+
+                       if (flags & METASLAB_HINTBP_AVOID &&
+                           mg->mg_next != NULL)
+                               mg = mg->mg_next;
+               } else {
+                       mg = mc->mc_rotor;
+               }
        } else if (d != 0) {
                vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
                mg = vd->vdev_mg->mg_next;
@@ -944,15 +1303,18 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
        }
 
        /*
-        * If the hint put us into the wrong class, just follow the rotor.
+        * If the hint put us into the wrong metaslab class, or into a
+        * metaslab group that has been passivated, just follow the rotor.
         */
-       if (mg->mg_class != mc)
+       if (mg->mg_class != mc || mg->mg_activation_count <= 0)
                mg = mc->mc_rotor;
 
        rotor = mg;
 top:
        all_zero = B_TRUE;
        do {
+               ASSERT(mg->mg_activation_count == 1);
+
                vd = mg->mg_vd;
 
                /*
@@ -997,32 +1359,28 @@ top:
                         * over- or under-used relative to the pool,
                         * and set an allocation bias to even it out.
                         */
-                       if (mc->mc_allocated == 0) {
+                       if (mc->mc_aliquot == 0) {
                                vdev_stat_t *vs = &vd->vdev_stat;
-                               uint64_t alloc, space;
-                               int64_t vu, su;
-
-                               alloc = spa_get_alloc(spa);
-                               space = spa_get_space(spa);
+                               int64_t vu, cu;
 
                                /*
                                 * Determine percent used in units of 0..1024.
                                 * (This is just to avoid floating point.)
                                 */
                                vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
-                               su = (alloc << 10) / (space + 1);
+                               cu = (mc->mc_alloc << 10) / (mc->mc_space + 1);
 
                                /*
                                 * Bias by at most +/- 25% of the aliquot.
                                 */
-                               mg->mg_bias = ((su - vu) *
+                               mg->mg_bias = ((cu - vu) *
                                    (int64_t)mg->mg_aliquot) / (1024 * 4);
                        }
 
-                       if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
+                       if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
                            mg->mg_aliquot + mg->mg_bias) {
                                mc->mc_rotor = mg->mg_next;
-                               mc->mc_allocated = 0;
+                               mc->mc_aliquot = 0;
                        }
 
                        DVA_SET_VDEV(&dva[d], vd->vdev_id);
@@ -1034,7 +1392,7 @@ top:
                }
 next:
                mc->mc_rotor = mg->mg_next;
-               mc->mc_allocated = 0;
+               mc->mc_aliquot = 0;
        } while ((mg = mg->mg_next) != rotor);
 
        if (!all_zero) {
@@ -1114,7 +1472,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
        uint64_t size = DVA_GET_ASIZE(dva);
        vdev_t *vd;
        metaslab_t *msp;
-       int error;
+       int error = 0;
 
        ASSERT(DVA_IS_VALID(dva));
 
@@ -1129,7 +1487,12 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 
        mutex_enter(&msp->ms_lock);
 
-       error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
+       if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
+               error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
+
+       if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
+               error = ENOENT;
+
        if (error || txg == 0) {        /* txg == 0 indicates dry run */
                mutex_exit(&msp->ms_lock);
                return (error);
@@ -1157,6 +1520,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
        int error = 0;
 
        ASSERT(bp->blk_birth == 0);
+       ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
 
        spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 
@@ -1186,7 +1550,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
 
        spa_config_exit(spa, SCL_ALLOC, FTAG);
 
-       bp->blk_birth = txg;
+       BP_SET_BIRTH(bp, txg, txg);
 
        return (0);
 }
@@ -1198,7 +1562,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
        int ndvas = BP_GET_NDVAS(bp);
 
        ASSERT(!BP_IS_HOLE(bp));
-       ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg);
+       ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
 
        spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
 
index f1b3b23..8358b4c 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/zfs_context.h>
 #include <sys/refcount.h>
 
diff --git a/module/zfs/sa.c b/module/zfs/sa.c
new file mode 100644 (file)
index 0000000..a91b379
--- /dev/null
@@ -0,0 +1,1886 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/sunddi.h>
+#include <sys/sa_impl.h>
+#include <sys/dnode.h>
+#include <sys/errno.h>
+#include <sys/zfs_context.h>
+
+/*
+ * ZFS System attributes:
+ *
+ * A generic mechanism to allow for arbitrary attributes
+ * to be stored in a dnode.  The data will be stored in the bonus buffer of
+ * the dnode and if necessary a special "spill" block will be used to handle
+ * overflow situations.  The spill block will be sized to fit the data
+ * from 512 - 128K.  When a spill block is used the BP (blkptr_t) for the
+ * spill block is stored at the end of the current bonus buffer.  Any
+ * attributes that would be in the way of the blkptr_t will be relocated
+ * into the spill block.
+ *
+ * Attribute registration:
+ *
+ * Stored persistently on a per dataset basis
+ * a mapping between attribute "string" names and their actual attribute
+ * numeric values, length, and byteswap function.  The names are only used
+ * during registration.  All  attributes are known by their unique attribute
+ * id value.  If an attribute can have a variable size then the value
+ * 0 will be used to indicate this.
+ *
+ * Attribute Layout:
+ *
+ * Attribute layouts are a way to compactly store multiple attributes, but
+ * without taking the overhead associated with managing each attribute
+ * individually.  Since you will typically have the same set of attributes
+ * stored in the same order a single table will be used to represent that
+ * layout.  The ZPL for example will usually have only about 10 different
+ * layouts (regular files, device files, symlinks,
+ * regular files + scanstamp, files/dir with extended attributes, and then
+ * you have the possibility of all of those minus ACL, because it would
+ * be kicked out into the spill block)
+ *
+ * Layouts are simply an array of the attributes and their
+ * ordering i.e. [0, 1, 4, 5, 2]
+ *
+ * Each distinct layout is given a unique layout number and that is what's
+ * stored in the header at the beginning of the SA data buffer.
+ *
+ * A layout only covers a single dbuf (bonus or spill).  If a set of
+ * attributes is split up between the bonus buffer and a spill buffer then
+ * two different layouts will be used.  This allows us to byteswap the
+ * spill without looking at the bonus buffer and keeps the on disk format of
+ * the bonus and spill buffer the same.
+ *
+ * Adding a single attribute will cause the entire set of attributes to
+ * be rewritten and could result in a new layout number being constructed
+ * as part of the rewrite if no such layout exists for the new set of
+ * attributes.  The new attribute will be appended to the end of the already
+ * existing attributes.
+ *
+ * Both the attribute registration and attribute layout information are
+ * stored in normal ZAP attributes.  There should be a small number of
+ * known layouts and the set of attributes is assumed to typically be quite
+ * small.
+ *
+ * The registered attributes and layout "table" information is maintained
+ * in core and a special "sa_os_t" is attached to the objset_t.
+ *
+ * A special interface is provided to allow for quickly applying
+ * a large set of attributes at once.  sa_replace_all_by_template() is
+ * used to set an array of attributes.  This is used by the ZPL when
+ * creating a brand new file.  The template that is passed into the function
+ * specifies the attribute, size for variable length attributes, location of
+ * data and special "data locator" function if the data isn't in a contiguous
+ * location.
+ *
+ * Byteswap implications:
+ * Since the SA attributes are not entirely self describing we can't do
+ * the normal byteswap processing.  The special ZAP layout attribute and
+ * attribute registration attributes define the byteswap function and the
+ * size of the attributes, unless it is variable sized.
+ * The normal ZFS byteswapping infrastructure assumes you don't need
+ * to read any objects in order to do the necessary byteswapping.  Whereas
+ * SA attributes can only be properly byteswapped if the dataset is opened
+ * and the layout/attribute ZAP attributes are available.  Because of this
+ * the SA attributes will be byteswapped when they are first accessed by
+ * the SA code that will read the SA data.
+ */
+
+typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t,
+    uint16_t length, int length_idx, boolean_t, void *userp);
+
+static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype);
+static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab);
+static void *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype,
+    void *data);
+static void sa_idx_tab_rele(objset_t *os, void *arg);
+static void sa_copy_data(sa_data_locator_t *func, void *start, void *target,
+    int buflen);
+static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
+    sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
+    uint16_t buflen, dmu_tx_t *tx);
+
+arc_byteswap_func_t *sa_bswap_table[] = {
+       byteswap_uint64_array,
+       byteswap_uint32_array,
+       byteswap_uint16_array,
+       byteswap_uint8_array,
+       zfs_acl_byteswap,
+};
+
+#define        SA_COPY_DATA(f, s, t, l) \
+       { \
+               if (f == NULL) { \
+                       if (l == 8) { \
+                               *(uint64_t *)t = *(uint64_t *)s; \
+                       } else if (l == 16) { \
+                               *(uint64_t *)t = *(uint64_t *)s; \
+                               *(uint64_t *)((uintptr_t)t + 8) = \
+                                   *(uint64_t *)((uintptr_t)s + 8); \
+                       } else { \
+                               bcopy(s, t, l); \
+                       } \
+               } else \
+                       sa_copy_data(f, s, t, l); \
+       }
+
+/*
+ * This table is fixed and cannot be changed.  Its purpose is to
+ * allow the SA code to work with both old/new ZPL file systems.
+ * It contains the list of legacy attributes.  These attributes aren't
+ * stored in the "attribute" registry zap objects, since older ZPL file systems
+ * won't have the registry.  Only objsets of type ZFS_TYPE_FILESYSTEM will
+ * use this static table.
+ */
+sa_attr_reg_t sa_legacy_attrs[] = {
+       {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
+       {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
+       {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
+       {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
+       {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
+       {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
+       {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
+       {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
+       {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
+       {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
+       {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
+       {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
+       {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
+       {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
+       {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
+       {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
+};
+
+/*
+ * ZPL legacy layout
+ * This is only used for objects of type DMU_OT_ZNODE
+ */
+sa_attr_type_t sa_legacy_zpl_layout[] = {
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
+/*
+ * Special dummy layout used for buffers with no attributes.
+ */
+
+sa_attr_type_t sa_dummy_zpl_layout[] = { 0 };
+
+static int sa_legacy_attr_count = 16;
+static kmem_cache_t *sa_cache = NULL;
+
+/*ARGSUSED*/
+static int
+sa_cache_constructor(void *buf, void *unused, int kmflag)
+{
+       sa_handle_t *hdl = buf;
+
+       hdl->sa_bonus_tab = NULL;
+       hdl->sa_spill_tab = NULL;
+       hdl->sa_os = NULL;
+       hdl->sa_userp = NULL;
+       hdl->sa_bonus = NULL;
+       hdl->sa_spill = NULL;
+       mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL);
+       return (0);
+}
+
+/*ARGSUSED*/
+static void
+sa_cache_destructor(void *buf, void *unused)
+{
+       sa_handle_t *hdl = buf;
+       mutex_destroy(&hdl->sa_lock);
+}
+
+void
+sa_cache_init(void)
+{
+       sa_cache = kmem_cache_create("sa_cache",
+           sizeof (sa_handle_t), 0, sa_cache_constructor,
+           sa_cache_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+sa_cache_fini(void)
+{
+       if (sa_cache)
+               kmem_cache_destroy(sa_cache);
+}
+
+static int
+layout_num_compare(const void *arg1, const void *arg2)
+{
+       const sa_lot_t *node1 = arg1;
+       const sa_lot_t *node2 = arg2;
+
+       if (node1->lot_num > node2->lot_num)
+               return (1);
+       else if (node1->lot_num < node2->lot_num)
+               return (-1);
+       return (0);
+}
+
+static int
+layout_hash_compare(const void *arg1, const void *arg2)
+{
+       const sa_lot_t *node1 = arg1;
+       const sa_lot_t *node2 = arg2;
+
+       if (node1->lot_hash > node2->lot_hash)
+               return (1);
+       if (node1->lot_hash < node2->lot_hash)
+               return (-1);
+       if (node1->lot_instance > node2->lot_instance)
+               return (1);
+       if (node1->lot_instance < node2->lot_instance)
+               return (-1);
+       return (0);
+}
+
+boolean_t
+sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count)
+{
+       int i;
+
+       if (count != tbf->lot_attr_count)
+               return (1);
+
+       for (i = 0; i != count; i++) {
+               if (attrs[i] != tbf->lot_attrs[i])
+                       return (1);
+       }
+       return (0);
+}
+
+#define        SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF])
+
+static uint64_t
+sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count)
+{
+       int i;
+       uint64_t crc = -1ULL;
+
+       for (i = 0; i != attr_count; i++)
+               crc ^= SA_ATTR_HASH(attrs[i]);
+
+       return (crc);
+}
+
+static boolean_t
+sa_has_blkptr(sa_handle_t *hdl)
+{
+       int rc;
+       if (hdl->sa_spill == NULL) {
+               if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL,
+                   &hdl->sa_spill)) == 0)
+                       VERIFY(0 == sa_build_index(hdl, SA_SPILL));
+       } else {
+               rc = 0;
+       }
+
+       return (rc == 0 ? B_TRUE : B_FALSE);
+}
+
+/*
+ * Main attribute lookup/update function
+ * returns 0 for success or non zero for failures
+ *
+ * Operates on bulk array, first failure will abort further processing
+ */
+int
+sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
+    sa_data_op_t data_op, dmu_tx_t *tx)
+{
+       sa_os_t *sa = hdl->sa_os->os_sa;
+       int i;
+       int error = 0;
+       sa_buf_type_t buftypes;
+
+       buftypes = 0;
+
+       ASSERT(count > 0);
+       for (i = 0; i != count; i++) {
+               ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs);
+
+               bulk[i].sa_addr = NULL;
+               /* First check the bonus buffer */
+
+               if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT(
+                   hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) {
+                       SA_ATTR_INFO(sa, hdl->sa_bonus_tab,
+                           SA_GET_HDR(hdl, SA_BONUS),
+                           bulk[i].sa_attr, bulk[i], SA_BONUS, hdl);
+                       if (tx && !(buftypes & SA_BONUS)) {
+                               dmu_buf_will_dirty(hdl->sa_bonus, tx);
+                               buftypes |= SA_BONUS;
+                       }
+               }
+               if (bulk[i].sa_addr == NULL && sa_has_blkptr(hdl)) {
+                       if (TOC_ATTR_PRESENT(
+                           hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) {
+                               SA_ATTR_INFO(sa, hdl->sa_spill_tab,
+                                   SA_GET_HDR(hdl, SA_SPILL),
+                                   bulk[i].sa_attr, bulk[i], SA_SPILL, hdl);
+                               if (tx && !(buftypes & SA_SPILL) &&
+                                   bulk[i].sa_size == bulk[i].sa_length) {
+                                       dmu_buf_will_dirty(hdl->sa_spill, tx);
+                                       buftypes |= SA_SPILL;
+                               }
+                       }
+               }
+               switch (data_op) {
+               case SA_LOOKUP:
+                       if (bulk[i].sa_addr == NULL)
+                               return (ENOENT);
+                       if (bulk[i].sa_data) {
+                               SA_COPY_DATA(bulk[i].sa_data_func,
+                                   bulk[i].sa_addr, bulk[i].sa_data,
+                                   bulk[i].sa_size);
+                       }
+                       continue;
+
+               case SA_UPDATE:
+                       /* existing rewrite of attr */
+                       if (bulk[i].sa_addr &&
+                           bulk[i].sa_size == bulk[i].sa_length) {
+                               SA_COPY_DATA(bulk[i].sa_data_func,
+                                   bulk[i].sa_data, bulk[i].sa_addr,
+                                   bulk[i].sa_length);
+                               continue;
+                       } else if (bulk[i].sa_addr) { /* attr size change */
+                               error = sa_modify_attrs(hdl, bulk[i].sa_attr,
+                                   SA_REPLACE, bulk[i].sa_data_func,
+                                   bulk[i].sa_data, bulk[i].sa_length, tx);
+                       } else { /* adding new attribute */
+                               error = sa_modify_attrs(hdl, bulk[i].sa_attr,
+                                   SA_ADD, bulk[i].sa_data_func,
+                                   bulk[i].sa_data, bulk[i].sa_length, tx);
+                       }
+                       if (error)
+                               return (error);
+                       break;
+               }
+       }
+       return (error);
+}
+
+static sa_lot_t *
+sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count,
+    uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx)
+{
+       sa_os_t *sa = os->os_sa;
+       sa_lot_t *tb, *findtb;
+       int i;
+       avl_index_t loc;
+
+       ASSERT(MUTEX_HELD(&sa->sa_lock));
+       tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP);
+       tb->lot_attr_count = attr_count;
+       tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
+           KM_SLEEP);
+       bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count);
+       tb->lot_num = lot_num;
+       tb->lot_hash = hash;
+       tb->lot_instance = 0;
+
+       if (zapadd) {
+               char attr_name[8];
+
+               if (sa->sa_layout_attr_obj == 0) {
+                       int error;
+                       sa->sa_layout_attr_obj = zap_create(os,
+                           DMU_OT_SA_ATTR_LAYOUTS, DMU_OT_NONE, 0, tx);
+                       error = zap_add(os, sa->sa_master_obj, SA_LAYOUTS, 8, 1,
+                           &sa->sa_layout_attr_obj, tx);
+                       ASSERT3U(error, ==, 0);
+               }
+
+               (void) snprintf(attr_name, sizeof (attr_name),
+                   "%d", (int)lot_num);
+               VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj,
+                   attr_name, 2, attr_count, attrs, tx));
+       }
+
+       list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t),
+           offsetof(sa_idx_tab_t, sa_next));
+
+       for (i = 0; i != attr_count; i++) {
+               if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0)
+                       tb->lot_var_sizes++;
+       }
+
+       avl_add(&sa->sa_layout_num_tree, tb);
+
+       /* verify we don't have a hash collision */
+       if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) {
+               for (; findtb && findtb->lot_hash == hash;
+                   findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) {
+                       if (findtb->lot_instance != tb->lot_instance)
+                               break;
+                       tb->lot_instance++;
+               }
+       }
+       avl_add(&sa->sa_layout_hash_tree, tb);
+       return (tb);
+}
+
+static void
+sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs,
+    int count, dmu_tx_t *tx, sa_lot_t **lot)
+{
+       sa_lot_t *tb, tbsearch;
+       avl_index_t loc;
+       sa_os_t *sa = os->os_sa;
+       boolean_t found = B_FALSE;
+
+       mutex_enter(&sa->sa_lock);
+       tbsearch.lot_hash = hash;
+       tbsearch.lot_instance = 0;
+       tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc);
+       if (tb) {
+               for (; tb && tb->lot_hash == hash;
+                   tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) {
+                       if (sa_layout_equal(tb, attrs, count) == 0) {
+                               found = B_TRUE;
+                               break;
+                       }
+               }
+       }
+       if (!found) {
+               tb = sa_add_layout_entry(os, attrs, count,
+                   avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx);
+       }
+       mutex_exit(&sa->sa_lock);
+       *lot = tb;
+}
+
+static int
+sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx)
+{
+       int error;
+       uint32_t blocksize;
+
+       if (size == 0) {
+               blocksize = SPA_MINBLOCKSIZE;
+       } else if (size > SPA_MAXBLOCKSIZE) {
+               ASSERT(0);
+               return (EFBIG);
+       } else {
+               blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t);
+       }
+
+       error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx);
+       ASSERT(error == 0);
+       return (error);
+}
+
+static void
+sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen)
+{
+       if (func == NULL) {
+               bcopy(datastart, target, buflen);
+       } else {
+               boolean_t start;
+               int bytes;
+               void *dataptr;
+               void *saptr = target;
+               uint32_t length;
+
+               start = B_TRUE;
+               bytes = 0;
+               while (bytes < buflen) {
+                       func(&dataptr, &length, buflen, start, datastart);
+                       bcopy(dataptr, saptr, length);
+                       saptr = (void *)((caddr_t)saptr + length);
+                       bytes += length;
+                       start = B_FALSE;
+               }
+       }
+}
+
+/*
+ * Determine several different sizes
+ * first the sa header size
+ * the number of bytes to be stored
+ * if spill would occur the index in the attribute array is returned
+ *
+ * the boolean will_spill will be set when spilling is necessary.  It
+ * is only set when the buftype is SA_BONUS
+ */
+static int
+sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count,
+    dmu_buf_t *db, sa_buf_type_t buftype, int *index, int *total,
+    boolean_t *will_spill)
+{
+       int var_size = 0;
+       int i;
+       int full_space;
+       int hdrsize;
+       boolean_t done = B_FALSE;
+
+       if (buftype == SA_BONUS && sa->sa_force_spill) {
+               *total = 0;
+               *index = 0;
+               *will_spill = B_TRUE;
+               return (0);
+       }
+
+       *index = -1;
+       *total = 0;
+
+       if (buftype == SA_BONUS)
+               *will_spill = B_FALSE;
+
+       hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 :
+           sizeof (sa_hdr_phys_t);
+
+       full_space = (buftype == SA_BONUS) ? DN_MAX_BONUSLEN : db->db_size;
+
+       for (i = 0; i != attr_count; i++) {
+               boolean_t is_var_sz;
+
+               *total += attr_desc[i].sa_length;
+               if (done)
+                       goto next;
+
+               is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0);
+               if (is_var_sz) {
+                       var_size++;
+               }
+
+               if (is_var_sz && var_size > 1) {
+                       if (P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) +
+                           *total < full_space) {
+                               hdrsize += sizeof (uint16_t);
+                       } else {
+                               done = B_TRUE;
+                               *index = i;
+                               if (buftype == SA_BONUS)
+                                       *will_spill = B_TRUE;
+                               continue;
+                       }
+               }
+
+               /*
+                * find index of where spill *could* occur.
+                * Then continue to count of remainder attribute
+                * space.  The sum is used later for sizing bonus
+                * and spill buffer.
+                */
+               if (buftype == SA_BONUS && *index == -1 &&
+                   P2ROUNDUP(*total + hdrsize, 8) >
+                   (full_space - sizeof (blkptr_t))) {
+                       *index = i;
+                       done = B_TRUE;
+               }
+
+next:
+               if (P2ROUNDUP(*total + hdrsize, 8) > full_space &&
+                   buftype == SA_BONUS)
+                       *will_spill = B_TRUE;
+       }
+
+       hdrsize = P2ROUNDUP(hdrsize, 8);
+       return (hdrsize);
+}
+
+/* Bytes needed to hold the attribute payload plus its SA header. */
+#define        BUF_SPACE_NEEDED(total, header) (total + header)
+
+/*
+ * Find layout that corresponds to ordering of attributes
+ * If not found a new layout number is created and added to
+ * persistent layout tables.
+ *
+ * Lays the attributes in attr_desc down into the bonus buffer and,
+ * when they don't all fit, into the spill block, writing an SA
+ * header at the front of each buffer.  Dirties the bonus (and
+ * possibly spill) dbufs in tx.  Returns 0 on success or EFBIG if
+ * the attributes cannot fit even in a maximum-size buffer.
+ */
+static int
+sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
+    dmu_tx_t *tx)
+{
+       sa_os_t *sa = hdl->sa_os->os_sa;
+       uint64_t hash;
+       sa_buf_type_t buftype;
+       sa_hdr_phys_t *sahdr;
+       void *data_start;
+       int buf_space;
+       sa_attr_type_t *attrs, *attrs_start;
+       int i, lot_count;
+       int hdrsize, spillhdrsize;
+       int used;
+       dmu_object_type_t bonustype;
+       sa_lot_t *lot;
+       int len_idx;
+       int spill_used;
+       boolean_t spilling;
+
+       dmu_buf_will_dirty(hdl->sa_bonus, tx);
+       bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus);
+
+       /* first determine bonus header size and sum of all attributes */
+       hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
+           SA_BONUS, &i, &used, &spilling);
+
+       if (used > SPA_MAXBLOCKSIZE)
+               return (EFBIG);
+
+       /*
+        * When spilling, cap the bonus length so a spill block pointer
+        * still fits alongside the bonus data.
+        */
+       VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
+           MIN(DN_MAX_BONUSLEN - sizeof (blkptr_t), used + hdrsize) :
+           used + hdrsize, tx));
+
+       ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) ||
+           bonustype == DMU_OT_SA);
+
+       /* setup and size spill buffer when needed */
+       if (spilling) {
+               boolean_t dummy;
+
+               if (hdl->sa_spill == NULL) {
+                       int error;
+                       error = dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL,
+                           &hdl->sa_spill);
+                       ASSERT3U(error, ==, 0);
+               }
+               dmu_buf_will_dirty(hdl->sa_spill, tx);
+
+               /* i was set by sa_find_sizes() to the first spilled attr */
+               spillhdrsize = sa_find_sizes(sa, &attr_desc[i],
+                   attr_count - i, hdl->sa_spill, SA_SPILL, &i,
+                   &spill_used, &dummy);
+
+               if (spill_used > SPA_MAXBLOCKSIZE)
+                       return (EFBIG);
+
+               buf_space = hdl->sa_spill->db_size - spillhdrsize;
+               if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) >
+                   hdl->sa_spill->db_size)
+                       VERIFY(0 == sa_resize_spill(hdl,
+                           BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx));
+       }
+
+       /* setup starting pointers to lay down data */
+       data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize);
+       sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data;
+       buftype = SA_BONUS;
+
+       if (spilling)
+               buf_space = (sa->sa_force_spill) ?
+                   0 : SA_BLKPTR_SPACE - hdrsize;
+       else
+               buf_space = hdl->sa_bonus->db_size - hdrsize;
+
+       attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
+           KM_SLEEP);
+       lot_count = 0;
+
+       /*
+        * Copy each attribute into place, keeping a running hash of
+        * the attribute ordering so the matching layout can be found
+        * (or created) per buffer.
+        */
+       for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) {
+               uint16_t length;
+
+               attrs[i] = attr_desc[i].sa_attr;
+               length = SA_REGISTERED_LEN(sa, attrs[i]);
+               if (length == 0)
+                       length = attr_desc[i].sa_length;
+
+               if (buf_space < length) {  /* switch to spill buffer */
+                       ASSERT(bonustype != DMU_OT_ZNODE);
+                       /* finish the bonus buffer's header before moving on */
+                       if (buftype == SA_BONUS && !sa->sa_force_spill) {
+                               sa_find_layout(hdl->sa_os, hash, attrs_start,
+                                   lot_count, tx, &lot);
+                               SA_SET_HDR(sahdr, lot->lot_num, hdrsize);
+                       }
+
+                       buftype = SA_SPILL;
+                       hash = -1ULL;
+                       len_idx = 0;
+
+                       sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data;
+                       sahdr->sa_magic = SA_MAGIC;
+                       data_start = (void *)((uintptr_t)sahdr +
+                           spillhdrsize);
+                       attrs_start = &attrs[i];
+                       buf_space = hdl->sa_spill->db_size - spillhdrsize;
+                       lot_count = 0;
+               }
+               hash ^= SA_ATTR_HASH(attrs[i]);
+               attr_desc[i].sa_addr = data_start;
+               attr_desc[i].sa_size = length;
+               SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data,
+                   data_start, length);
+               /* variable-size attrs record their length in the header */
+               if (sa->sa_attr_table[attrs[i]].sa_length == 0) {
+                       sahdr->sa_lengths[len_idx++] = length;
+               }
+               /* attributes are packed on 8-byte boundaries */
+               data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
+                   length), 8);
+               buf_space -= P2ROUNDUP(length, 8);
+               lot_count++;
+       }
+
+       sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot);
+       if (bonustype == DMU_OT_SA) {
+               SA_SET_HDR(sahdr, lot->lot_num,
+                   buftype == SA_BONUS ? hdrsize : spillhdrsize);
+       }
+
+       kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count);
+       if (hdl->sa_bonus_tab) {
+               sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
+               hdl->sa_bonus_tab = NULL;
+       }
+       if (!sa->sa_force_spill)
+               VERIFY(0 == sa_build_index(hdl, SA_BONUS));
+       if (hdl->sa_spill) {
+               sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
+               if (!spilling) {
+                       /*
+                        * remove spill block that is no longer needed.
+                        * set sa_spill_remove to prevent sa_attr_op
+                        * from trying to retrieve spill block before its
+                        * been removed.  The flag will be cleared if/when
+                        * the handle is destroyed recreated or
+                        * sa_build_layouts() needs to spill again.
+                        *
+                        * NOTE(review): no sa_spill_remove flag is set in
+                        * this function -- comment may be stale; confirm.
+                        */
+                       dmu_buf_rele(hdl->sa_spill, NULL);
+                       hdl->sa_spill = NULL;
+                       hdl->sa_spill_tab = NULL;
+                       VERIFY(0 == dmu_rm_spill(hdl->sa_os,
+                           sa_handle_object(hdl), tx));
+               } else {
+                       VERIFY(0 == sa_build_index(hdl, SA_SPILL));
+               }
+       }
+
+       return (0);
+}
+
+/*
+ * Build the per-objset attribute tables.
+ *
+ * sa_user_table maps each caller-requested attribute in reg_attrs to
+ * its SA attribute number.  sa_attr_table describes every attribute
+ * known to this objset: those found in the registry ZAP, the legacy
+ * znode set (for ZPL objsets), and the requested attributes.
+ */
+static void
+sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
+{
+       sa_os_t *sa = os->os_sa;
+       uint64_t sa_attr_count = 0;
+       int error = 0;
+       uint64_t attr_value;
+       sa_attr_table_t *tb;
+       zap_cursor_t zc;
+       zap_attribute_t za;
+       int registered_count = 0;
+       int i;
+       dmu_objset_type_t ostype = dmu_objset_type(os);
+
+       sa->sa_user_table =
+           kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP);
+       sa->sa_user_table_sz = count * sizeof (sa_attr_type_t);
+
+       if (sa->sa_reg_attr_obj != 0)
+               VERIFY(zap_count(os, sa->sa_reg_attr_obj, &sa_attr_count) == 0);
+
+       /* ZPL objsets with an empty registry still reserve legacy numbers */
+       if (ostype == DMU_OST_ZFS && sa_attr_count == 0)
+               sa_attr_count += sa_legacy_attr_count;
+
+       /* Allocate attribute numbers for attributes that aren't registered */
+       for (i = 0; i != count; i++) {
+               boolean_t found = B_FALSE;
+               int j;
+
+               /* legacy ZPL attribute names keep their fixed numbers */
+               if (ostype == DMU_OST_ZFS) {
+                       for (j = 0; j != sa_legacy_attr_count; j++) {
+                               if (strcmp(reg_attrs[i].sa_name,
+                                   sa_legacy_attrs[j].sa_name) == 0) {
+                                       sa->sa_user_table[i] =
+                                           sa_legacy_attrs[j].sa_attr;
+                                       found = B_TRUE;
+                               }
+                       }
+               }
+               if (found)
+                       continue;
+
+               if (sa->sa_reg_attr_obj)
+                       error = zap_lookup(os, sa->sa_reg_attr_obj,
+                           reg_attrs[i].sa_name, 8, 1, &attr_value);
+               else
+                       error = ENOENT;
+               /* any lookup error is treated as "not registered" */
+               switch (error) {
+               default:
+               case ENOENT:
+                       sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count;
+                       sa_attr_count++;
+                       break;
+               case 0:
+                       sa->sa_user_table[i] = ATTR_NUM(attr_value);
+                       break;
+               }
+       }
+
+       os->os_sa->sa_num_attrs = sa_attr_count;
+       tb = os->os_sa->sa_attr_table =
+           kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP);
+
+       /*
+        * Attribute table is constructed from requested attribute list,
+        * previously foreign registered attributes, and also the legacy
+        * ZPL set of attributes.
+        */
+
+       if (sa->sa_reg_attr_obj) {
+               for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj);
+                   zap_cursor_retrieve(&zc, &za) == 0;
+                   zap_cursor_advance(&zc)) {
+                       uint64_t value;
+                       value  = za.za_first_integer;
+
+                       registered_count++;
+                       tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value);
+                       tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value);
+                       tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value);
+                       tb[ATTR_NUM(value)].sa_registered = B_TRUE;
+
+                       if (tb[ATTR_NUM(value)].sa_name) {
+                               continue;
+                       }
+                       tb[ATTR_NUM(value)].sa_name =
+                           kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP);
+                       (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name,
+                           strlen(za.za_name) +1);
+               }
+               zap_cursor_fini(&zc);
+       }
+
+       if (ostype == DMU_OST_ZFS) {
+               for (i = 0; i != sa_legacy_attr_count; i++) {
+                       /* skip slots already filled from the registry */
+                       if (tb[i].sa_name)
+                               continue;
+                       tb[i].sa_attr = sa_legacy_attrs[i].sa_attr;
+                       tb[i].sa_length = sa_legacy_attrs[i].sa_length;
+                       tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap;
+                       tb[i].sa_registered = B_FALSE;
+                       tb[i].sa_name =
+                           kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1,
+                           KM_SLEEP);
+                       (void) strlcpy(tb[i].sa_name,
+                           sa_legacy_attrs[i].sa_name,
+                           strlen(sa_legacy_attrs[i].sa_name) + 1);
+               }
+       }
+
+       for (i = 0; i != count; i++) {
+               sa_attr_type_t attr_id;
+
+               attr_id = sa->sa_user_table[i];
+               if (tb[attr_id].sa_name)
+                       continue;
+
+               tb[attr_id].sa_length = reg_attrs[i].sa_length;
+               tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap;
+               tb[attr_id].sa_attr = attr_id;
+               tb[attr_id].sa_name =
+                   kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP);
+               (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name,
+                   strlen(reg_attrs[i].sa_name) + 1);
+       }
+
+       /* anything not yet in the registry must be registered at sync time */
+       os->os_sa->sa_need_attr_registration =
+           (sa_attr_count != registered_count);
+}
+
+/*
+ * Set up the SA framework for objset os and return the table mapping
+ * each requested attribute in reg_attrs to its SA attribute number.
+ * If the objset is already set up, the existing user table is
+ * returned.  sa_obj is the master-node object holding the layout and
+ * registry ZAPs (0 if none).  Returns NULL if either ZAP lookup
+ * fails with a real (non-ENOENT) error.
+ */
+sa_attr_type_t *
+sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count)
+{
+       zap_cursor_t zc;
+       zap_attribute_t za;
+       sa_os_t *sa;
+       dmu_objset_type_t ostype = dmu_objset_type(os);
+       sa_attr_type_t *tb;
+
+       mutex_enter(&os->os_lock);
+       if (os->os_sa) {
+               mutex_enter(&os->os_sa->sa_lock);
+               mutex_exit(&os->os_lock);
+               tb = os->os_sa->sa_user_table;
+               mutex_exit(&os->os_sa->sa_lock);
+               return (tb);
+       }
+
+       sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP);
+       mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL);
+       sa->sa_master_obj = sa_obj;
+
+       /* hand off from the objset lock to the new sa_lock */
+       mutex_enter(&sa->sa_lock);
+       mutex_exit(&os->os_lock);
+       avl_create(&sa->sa_layout_num_tree, layout_num_compare,
+           sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node));
+       avl_create(&sa->sa_layout_hash_tree, layout_hash_compare,
+           sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node));
+
+       if (sa_obj) {
+               int error;
+               error = zap_lookup(os, sa_obj, SA_LAYOUTS,
+                   8, 1, &sa->sa_layout_attr_obj);
+               if (error != 0 && error != ENOENT) {
+                       /*
+                        * Fix: this path previously returned with
+                        * sa_lock still held, unlike the error path
+                        * below.
+                        *
+                        * NOTE(review): both error paths leak the
+                        * sa_os_t and its AVL trees -- confirm whether
+                        * callers are expected to retry setup.
+                        */
+                       mutex_exit(&sa->sa_lock);
+                       return (NULL);
+               }
+               error = zap_lookup(os, sa_obj, SA_REGISTRY,
+                   8, 1, &sa->sa_reg_attr_obj);
+               if (error != 0 && error != ENOENT) {
+                       mutex_exit(&sa->sa_lock);
+                       return (NULL);
+               }
+       }
+
+       os->os_sa = sa;
+       sa_attr_table_setup(os, reg_attrs, count);
+
+       /* load the persistent layouts from the layout ZAP */
+       if (sa->sa_layout_attr_obj != 0) {
+               for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj);
+                   zap_cursor_retrieve(&zc, &za) == 0;
+                   zap_cursor_advance(&zc)) {
+                       sa_attr_type_t *lot_attrs;
+                       uint64_t lot_num;
+
+                       lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) *
+                           za.za_num_integers, KM_SLEEP);
+
+                       VERIFY(zap_lookup(os, sa->sa_layout_attr_obj,
+                           za.za_name, 2, za.za_num_integers, lot_attrs) == 0);
+                       VERIFY(ddi_strtoull(za.za_name, NULL, 10,
+                           (unsigned long long *)&lot_num) == 0);
+
+                       (void) sa_add_layout_entry(os, lot_attrs,
+                           za.za_num_integers, lot_num,
+                           sa_layout_info_hash(lot_attrs,
+                           za.za_num_integers), B_FALSE, NULL);
+                       kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
+                           za.za_num_integers);
+               }
+               zap_cursor_fini(&zc);
+       }
+
+       /* Add special layout number for old ZNODES */
+       if (ostype == DMU_OST_ZFS) {
+               (void) sa_add_layout_entry(os, sa_legacy_zpl_layout,
+                   sa_legacy_attr_count, 0,
+                   sa_layout_info_hash(sa_legacy_zpl_layout,
+                   sa_legacy_attr_count), B_FALSE, NULL);
+
+               (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1,
+                   0, B_FALSE, NULL);
+       }
+       mutex_exit(&sa->sa_lock);
+       return (os->os_sa->sa_user_table);
+}
+
+/*
+ * Release all SA state for an objset: the user table, the attribute
+ * table (including each allocated name), every cached index table,
+ * and both layout AVL trees.
+ */
+void
+sa_tear_down(objset_t *os)
+{
+       sa_os_t *sa = os->os_sa;
+       sa_lot_t *layout;
+       void *cookie;
+       int i;
+
+       kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
+
+       /* Free up attr table */
+
+       for (i = 0; i != sa->sa_num_attrs; i++) {
+               if (sa->sa_attr_table[i].sa_name)
+                       kmem_free(sa->sa_attr_table[i].sa_name,
+                           strlen(sa->sa_attr_table[i].sa_name) + 1);
+       }
+
+       kmem_free(sa->sa_attr_table,
+           sizeof (sa_attr_table_t) * sa->sa_num_attrs);
+
+       /* drop every idx tab still cached on a layout */
+       cookie = NULL;
+       while (layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie)) {
+               sa_idx_tab_t *tab;
+               while (tab = list_head(&layout->lot_idx_tab)) {
+                       ASSERT(refcount_count(&tab->sa_refcount));
+                       sa_idx_tab_rele(os, tab);
+               }
+       }
+
+       /* the second pass, over the num tree, frees the layouts themselves */
+       cookie = NULL;
+       while (layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie)) {
+               kmem_free(layout->lot_attrs,
+                   sizeof (sa_attr_type_t) * layout->lot_attr_count);
+               kmem_free(layout, sizeof (sa_lot_t));
+       }
+
+       avl_destroy(&sa->sa_layout_hash_tree);
+       avl_destroy(&sa->sa_layout_num_tree);
+
+       kmem_free(sa, sizeof (sa_os_t));
+       os->os_sa = NULL;
+}
+
+/*
+ * sa_attr_iter() callback: record one attribute's offset from the
+ * start of the buffer (and its length, when variable-size) into the
+ * index table passed via userp.
+ */
+void
+sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr,
+    uint16_t length, int length_idx, boolean_t var_length, void *userp)
+{
+       sa_idx_tab_t *idx_tab = userp;
+
+       if (var_length) {
+               ASSERT(idx_tab->sa_variable_lengths);
+               idx_tab->sa_variable_lengths[length_idx] = length;
+       }
+       /* pack (length index, byte offset) into the per-attr TOC entry */
+       TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx,
+           (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr));
+}
+
+/*
+ * Walk every attribute laid out in an SA buffer, invoking func on
+ * each with its address and length.  If tab is NULL the layout is
+ * looked up from the buffer's header; otherwise the caller-supplied
+ * layout is used.
+ */
+static void
+sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type,
+    sa_iterfunc_t func, sa_lot_t *tab, void *userp)
+{
+       void *data_start;
+       sa_lot_t *tb = tab;
+       sa_lot_t search;
+       avl_index_t loc;
+       sa_os_t *sa = os->os_sa;
+       int i;
+       uint16_t *length_start = NULL;
+       uint8_t length_idx = 0;
+
+       if (tab == NULL) {
+               search.lot_num = SA_LAYOUT_NUM(hdr, type);
+               tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
+               ASSERT(tb);
+       }
+
+       if (IS_SA_BONUSTYPE(type)) {
+               /* data starts 8-byte aligned past the variable-length array */
+               data_start = (void *)P2ROUNDUP(((uintptr_t)hdr +
+                   offsetof(sa_hdr_phys_t, sa_lengths) +
+                   (sizeof (uint16_t) * tb->lot_var_sizes)), 8);
+               length_start = hdr->sa_lengths;
+       } else {
+               data_start = hdr;
+       }
+
+       for (i = 0; i != tb->lot_attr_count; i++) {
+               int attr_length, reg_length;
+               uint8_t idx_len;
+
+               reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length;
+               if (reg_length) {
+                       /* fixed-size attribute */
+                       attr_length = reg_length;
+                       idx_len = 0;
+               } else {
+                       /* variable-size: length comes from the header */
+                       attr_length = length_start[length_idx];
+                       idx_len = length_idx++;
+               }
+
+               func(hdr, data_start, tb->lot_attrs[i], attr_length,
+                   idx_len, reg_length == 0 ? B_TRUE : B_FALSE, userp);
+
+               /* attributes are packed on 8-byte boundaries */
+               data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
+                   attr_length), 8);
+       }
+}
+
+/*
+ * sa_attr_iter() callback: byteswap a single attribute in place
+ * using its registered byteswap routine.
+ */
+/*ARGSUSED*/
+void
+sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr,
+    uint16_t length, int length_idx, boolean_t variable_length, void *userp)
+{
+       sa_handle_t *hdl = userp;
+       sa_os_t *sa = hdl->sa_os->os_sa;
+
+       sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length);
+}
+
+/*
+ * Byteswap an entire SA buffer (bonus or spill) into native order:
+ * the magic, the layout info, the variable-length array, and then
+ * every attribute via sa_attr_iter().  No-op if the magic is already
+ * native.  Caller must hold sa_lock.
+ */
+void
+sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype)
+{
+       sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype);
+       dmu_buf_impl_t *db;
+       sa_os_t *sa = hdl->sa_os->os_sa;
+       int num_lengths = 1;
+       int i;
+
+       ASSERT(MUTEX_HELD(&sa->sa_lock));
+       if (sa_hdr_phys->sa_magic == SA_MAGIC)
+               return;
+
+       db = SA_GET_DB(hdl, buftype);
+
+       if (buftype == SA_SPILL) {
+               /* release/thaw the ARC buffer so it can be modified in place */
+               arc_release(db->db_buf, NULL);
+               arc_buf_thaw(db->db_buf);
+       }
+
+       sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic);
+       sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info);
+
+       /*
+        * Determine number of variable lengths in header
+        * The standard 8 byte header has one for free and a
+        * 16 byte header would have 4 + 1;
+        */
+       if (SA_HDR_SIZE(sa_hdr_phys) > 8)
+               num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1;
+       for (i = 0; i != num_lengths; i++)
+               sa_hdr_phys->sa_lengths[i] =
+                   BSWAP_16(sa_hdr_phys->sa_lengths[i]);
+
+       sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA,
+           sa_byteswap_cb, NULL, hdl);
+
+       if (buftype == SA_SPILL)
+               arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf);
+}
+
+/*
+ * Build (or find a shared) table of contents for one of the handle's
+ * buffers, byteswapping the buffer first if its header magic is in
+ * foreign byte order.  The resulting idx tab is cached on the
+ * handle.  Always returns 0.
+ */
+static int
+sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype)
+{
+       sa_hdr_phys_t *sa_hdr_phys;
+       dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype);
+       dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db);
+       sa_os_t *sa = hdl->sa_os->os_sa;
+       sa_idx_tab_t *idx_tab;
+
+       sa_hdr_phys = SA_GET_HDR(hdl, buftype);
+
+       mutex_enter(&sa->sa_lock);
+
+       /* Do we need to byteswap? */
+
+       /* only check if not old znode */
+       if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC &&
+           sa_hdr_phys->sa_magic != 0) {
+               VERIFY(BSWAP_32(sa_hdr_phys->sa_magic) == SA_MAGIC);
+               sa_byteswap(hdl, buftype);
+       }
+
+       idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys);
+
+       /* cache the TOC on the handle for the buffer it describes */
+       if (buftype == SA_BONUS)
+               hdl->sa_bonus_tab = idx_tab;
+       else
+               hdl->sa_spill_tab = idx_tab;
+
+       mutex_exit(&sa->sa_lock);
+       return (0);
+}
+
+/*
+ * Eviction callback registered on shared SA handles' bonus dbufs.
+ * Reaching this callback indicates a reference-counting bug, hence
+ * the panic.
+ */
+/*ARGSUSED*/
+void
+sa_evict(dmu_buf_t *db, void *sap)
+{
+       panic("evicting sa dbuf %p\n", (void *)db);
+}
+
+/*
+ * Drop a reference on an index table; on last release unlink it from
+ * its layout's list and free it, including the variable-lengths
+ * array if present.  A NULL arg is a no-op.
+ */
+static void
+sa_idx_tab_rele(objset_t *os, void *arg)
+{
+       sa_os_t *sa = os->os_sa;
+       sa_idx_tab_t *idx_tab = arg;
+
+       if (idx_tab == NULL)
+               return;
+
+       mutex_enter(&sa->sa_lock);
+       if (refcount_remove(&idx_tab->sa_refcount, NULL) == 0) {
+               list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab);
+               if (idx_tab->sa_variable_lengths)
+                       kmem_free(idx_tab->sa_variable_lengths,
+                           sizeof (uint16_t) *
+                           idx_tab->sa_layout->lot_var_sizes);
+               refcount_destroy(&idx_tab->sa_refcount);
+               kmem_free(idx_tab->sa_idx_tab,
+                   sizeof (uint32_t) * sa->sa_num_attrs);
+               kmem_free(idx_tab, sizeof (sa_idx_tab_t));
+       }
+       mutex_exit(&sa->sa_lock);
+}
+
+/*
+ * Take a reference on an index table.  Caller must hold sa_lock.
+ */
+static void
+sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab)
+{
+       sa_os_t *sa = os->os_sa;
+
+       ASSERT(MUTEX_HELD(&sa->sa_lock));
+       (void) refcount_add(&idx_tab->sa_refcount, NULL);
+}
+
+/*
+ * Tear down an SA handle: unregister it from the bonus dbuf, drop
+ * the cached index tables, release the bonus (and any spill) holds,
+ * and return the handle to its kmem cache.
+ */
+void
+sa_handle_destroy(sa_handle_t *hdl)
+{
+       mutex_enter(&hdl->sa_lock);
+       /* detach from the bonus dbuf so eviction no longer sees us */
+       (void) dmu_buf_update_user((dmu_buf_t *)hdl->sa_bonus, hdl,
+           NULL, NULL, NULL);
+
+       if (hdl->sa_bonus_tab) {
+               sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
+               hdl->sa_bonus_tab = NULL;
+       }
+       if (hdl->sa_spill_tab) {
+               sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
+               hdl->sa_spill_tab = NULL;
+       }
+
+       dmu_buf_rele(hdl->sa_bonus, NULL);
+
+       if (hdl->sa_spill)
+               dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL);
+       mutex_exit(&hdl->sa_lock);
+
+       kmem_cache_free(sa_cache, hdl);
+}
+
+/*
+ * Return an SA handle for the object backing bonus buffer db.  With
+ * SA_HDL_SHARED the handle is cached as the dbuf's user data so
+ * concurrent callers share one handle; SA_HDL_PRIVATE always builds
+ * a fresh one.
+ */
+int
+sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp,
+    sa_handle_type_t hdl_type, sa_handle_t **handlepp)
+{
+       int error = 0;
+       dmu_object_info_t doi;
+       sa_handle_t *handle;
+
+#ifdef ZFS_DEBUG
+       dmu_object_info_from_db(db, &doi);
+       ASSERT(doi.doi_bonus_type == DMU_OT_SA ||
+           doi.doi_bonus_type == DMU_OT_ZNODE);
+#endif
+       /* find handle, if it exists */
+       /* if one doesn't exist then create a new one, and initialize it */
+
+       handle = (hdl_type == SA_HDL_SHARED) ? dmu_buf_get_user(db) : NULL;
+       if (handle == NULL) {
+               sa_handle_t *newhandle;
+               handle = kmem_cache_alloc(sa_cache, KM_SLEEP);
+               handle->sa_userp = userp;
+               handle->sa_bonus = db;
+               handle->sa_os = os;
+               handle->sa_spill = NULL;
+
+               error = sa_build_index(handle, SA_BONUS);
+               /* another thread may have installed a handle concurrently */
+               newhandle = (hdl_type == SA_HDL_SHARED) ?
+                   dmu_buf_set_user_ie(db, handle,
+                   NULL, sa_evict) : NULL;
+
+               if (newhandle != NULL) {
+                       /* lost the race: discard ours, use the winner's */
+                       kmem_cache_free(sa_cache, handle);
+                       handle = newhandle;
+               }
+       }
+       /*
+        * NOTE(review): if sa_build_index() failed, the handle is still
+        * installed and returned alongside the error -- confirm callers
+        * handle this combination.
+        */
+       *handlepp = handle;
+
+       return (error);
+}
+
+/*
+ * Hold the object's bonus buffer and return an SA handle for it.
+ * Returns 0 on success or the error from dmu_bonus_hold().
+ */
+int
+sa_handle_get(objset_t *objset, uint64_t objid, void *userp,
+    sa_handle_type_t hdl_type, sa_handle_t **handlepp)
+{
+       dmu_buf_t *db;
+       int error;
+
+       if (error = dmu_bonus_hold(objset, objid, NULL, &db))
+               return (error);
+
+       /*
+        * NOTE(review): if sa_handle_get_from_db() fails, the bonus
+        * hold taken above is not released here -- confirm ownership
+        * of the hold on the error path.
+        */
+       return (sa_handle_get_from_db(objset, db, userp, hdl_type,
+           handlepp));
+}
+
+/*
+ * Thin wrapper: hold an object's bonus buffer on behalf of tag.
+ */
+int
+sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db)
+{
+       return (dmu_bonus_hold(objset, obj_num, tag, db));
+}
+
+/*
+ * Thin wrapper: release a buffer held via sa_buf_hold().
+ */
+void
+sa_buf_rele(dmu_buf_t *db, void *tag)
+{
+       dmu_buf_rele(db, tag);
+}
+
+/*
+ * Bulk attribute lookup.  Caller must hold the handle lock.
+ */
+int
+sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count)
+{
+       ASSERT(hdl);
+       ASSERT(MUTEX_HELD(&hdl->sa_lock));
+       return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL));
+}
+
+/*
+ * Copy a single attribute's value into buf (at most buflen bytes),
+ * taking and dropping the handle lock around the lookup.
+ */
+int
+sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen)
+{
+       sa_bulk_attr_t desc;
+       int rc;
+
+       ASSERT(hdl);
+
+       /* Describe the one attribute we want fetched. */
+       desc.sa_data_func = NULL;
+       desc.sa_length = buflen;
+       desc.sa_data = buf;
+       desc.sa_attr = attr;
+
+       mutex_enter(&hdl->sa_lock);
+       rc = sa_lookup_impl(hdl, &desc, 1);
+       mutex_exit(&hdl->sa_lock);
+
+       return (rc);
+}
+
+#ifdef _KERNEL
+/*
+ * Copy a single attribute's value out to a uio, truncated to the
+ * space remaining in the uio.  Returns ENOENT if the attribute is
+ * not present, otherwise the uiomove() result.
+ */
+int
+sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio)
+{
+       int error;
+       sa_bulk_attr_t bulk;
+
+       /*
+        * NOTE(review): bulk.sa_length is left uninitialized here;
+        * presumably unread on the SA_LOOKUP path when sa_data is
+        * NULL -- confirm against sa_attr_op().
+        */
+       bulk.sa_data = NULL;
+       bulk.sa_attr = attr;
+       bulk.sa_data_func = NULL;
+
+       ASSERT(hdl);
+
+       mutex_enter(&hdl->sa_lock);
+       if (sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL) == 0) {
+               error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size,
+                   uio->uio_resid), UIO_READ, uio);
+       } else {
+               error = ENOENT;
+       }
+       mutex_exit(&hdl->sa_lock);
+       return (error);
+
+}
+#endif
+
+/*
+ * Find an already existing TOC from given os and data
+ * This is a special interface to be used by the ZPL for
+ * finding the uid/gid/gen attributes.
+ *
+ * Returns a held sa_idx_tab_t -- either a cached one whose variable
+ * lengths match the buffer, or a freshly built one inserted on the
+ * layout's list.
+ */
+void *
+sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, void *data)
+{
+       sa_idx_tab_t *idx_tab;
+       sa_hdr_phys_t *hdr = (sa_hdr_phys_t *)data;
+       sa_os_t *sa = os->os_sa;
+       sa_lot_t *tb, search;
+       avl_index_t loc;
+
+       /*
+        * Determine layout number.  If SA node and header == 0 then
+        * force the index table to the dummy "1" empty layout.
+        *
+        * The layout number would only be zero for a newly created file
+        * that has not added any attributes yet, or with crypto enabled which
+        * doesn't write any attributes to the bonus buffer.
+        */
+
+       search.lot_num = SA_LAYOUT_NUM(hdr, bonustype);
+
+       tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
+
+       /* Verify header size is consistent with layout information */
+       ASSERT(tb);
+       /* (expression relies on && binding tighter than ||) */
+       ASSERT(IS_SA_BONUSTYPE(bonustype) &&
+           SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) || !IS_SA_BONUSTYPE(bonustype) ||
+           (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0));
+
+       /*
+        * See if any of the already existing TOC entries can be reused?
+        */
+
+       for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab;
+           idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) {
+               boolean_t valid_idx = B_TRUE;
+               int i;
+
+               /* a cached TOC only matches if its variable lengths do */
+               if (tb->lot_var_sizes != 0 &&
+                   idx_tab->sa_variable_lengths != NULL) {
+                       for (i = 0; i != tb->lot_var_sizes; i++) {
+                               if (hdr->sa_lengths[i] !=
+                                   idx_tab->sa_variable_lengths[i]) {
+                                       valid_idx = B_FALSE;
+                                       break;
+                               }
+                       }
+               }
+               if (valid_idx) {
+                       sa_idx_tab_hold(os, idx_tab);
+                       return (idx_tab);
+               }
+       }
+
+       /* No such luck, create a new entry */
+       idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP);
+       idx_tab->sa_idx_tab =
+           kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP);
+       idx_tab->sa_layout = tb;
+       refcount_create(&idx_tab->sa_refcount);
+       if (tb->lot_var_sizes)
+               idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) *
+                   tb->lot_var_sizes, KM_SLEEP);
+
+       sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab,
+           tb, idx_tab);
+       sa_idx_tab_hold(os, idx_tab);   /* one hold for consumer */
+       sa_idx_tab_hold(os, idx_tab);   /* one for layout */
+       list_insert_tail(&tb->lot_idx_tab, idx_tab);
+       return (idx_tab);
+}
+
+/*
+ * Default data locator: the attribute's data lives in one contiguous
+ * buffer supplied via userdata, so hand the whole thing back at once.
+ */
+void
+sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len,
+    boolean_t start, void *userdata)
+{
+       /* Only ever invoked to begin an extraction, never mid-stream. */
+       ASSERT(start);
+
+       *len = total_len;
+       *dataptr = userdata;
+}
+
+/*
+ * Sync context: persist any attribute-table entries not yet recorded
+ * in the registry ZAP, creating the registry object on first use.
+ * No-op when nothing needs registration or the objset has no SA
+ * master node.
+ */
+static void
+sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx)
+{
+       uint64_t attr_value = 0;
+       sa_os_t *sa = hdl->sa_os->os_sa;
+       sa_attr_table_t *tb = sa->sa_attr_table;
+       int i;
+
+       mutex_enter(&sa->sa_lock);
+
+       /* these are object numbers (uint64_t); compare against 0, not NULL */
+       if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) {
+               mutex_exit(&sa->sa_lock);
+               return;
+       }
+
+       if (sa->sa_reg_attr_obj == 0) {
+               int error;
+               sa->sa_reg_attr_obj = zap_create(hdl->sa_os,
+                   DMU_OT_SA_ATTR_REGISTRATION, DMU_OT_NONE, 0, tx);
+               error = zap_add(hdl->sa_os, sa->sa_master_obj,
+                   SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj, tx);
+               ASSERT(error == 0);
+       }
+       for (i = 0; i != sa->sa_num_attrs; i++) {
+               if (sa->sa_attr_table[i].sa_registered)
+                       continue;
+               ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length,
+                   tb[i].sa_byteswap);
+               VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj,
+                   tb[i].sa_name, 8, 1, &attr_value, tx));
+               tb[i].sa_registered = B_TRUE;
+       }
+       sa->sa_need_attr_registration = B_FALSE;
+       mutex_exit(&sa->sa_lock);
+}
+
+/*
+ * Replace all attributes with the attributes specified in the
+ * template.  If the dnode had a spill buffer then those attributes
+ * will also be replaced, possibly with just an empty spill block.
+ *
+ * This interface is intended to only be used for bulk adding of
+ * attributes for a new file.  It will also be used by the ZPL
+ * when converting an old-format znode to native SA support.
+ *
+ * Caller must hold the handle lock.
+ */
+int
+sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
+    int attr_count, dmu_tx_t *tx)
+{
+       sa_os_t *sa = hdl->sa_os->os_sa;
+
+       /* push any unregistered attributes out to the registry first */
+       if (sa->sa_need_attr_registration)
+               sa_attr_register_sync(hdl, tx);
+       return (sa_build_layouts(hdl, attr_desc, attr_count, tx));
+}
+
+/*
+ * Locking wrapper around sa_replace_all_by_template_locked().
+ */
+int
+sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
+    int attr_count, dmu_tx_t *tx)
+{
+       int error;
+
+       mutex_enter(&hdl->sa_lock);
+       error = sa_replace_all_by_template_locked(hdl, attr_desc,
+           attr_count, tx);
+       mutex_exit(&hdl->sa_lock);
+       return (error);
+}
+
+/*
+ * add/remove/replace a single attribute and then rewrite the entire set
+ * of attributes.
+ */
+static int
+sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
+    sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
+    uint16_t buflen, dmu_tx_t *tx)
+{
+       sa_os_t *sa = hdl->sa_os->os_sa;
+       sa_bulk_attr_t *attr_desc;
+       void *old_data[2];
+       int bonus_attr_count = 0;
+       int bonus_data_size, spill_data_size;
+       int spill_attr_count = 0;
+       int error;
+       uint16_t length;
+       int i, j, k, length_idx;
+       sa_hdr_phys_t *hdr;
+       sa_idx_tab_t *idx_tab;
+       int attr_count;
+       int count;
+
+       ASSERT(MUTEX_HELD(&hdl->sa_lock));
+
+       /* First make of copy of the old data */
+
+       if (((dmu_buf_impl_t *)hdl->sa_bonus)->db_dnode->dn_bonuslen) {
+               bonus_data_size = hdl->sa_bonus->db_size;
+               old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP);
+               bcopy(hdl->sa_bonus->db_data, old_data[0],
+                   hdl->sa_bonus->db_size);
+               bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count;
+       } else {
+               old_data[0] = NULL;
+       }
+
+       /* Bring spill buffer online if it isn't currently */
+
+       if (sa_has_blkptr(hdl)) {
+               spill_data_size = hdl->sa_spill->db_size;
+               old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP);
+               bcopy(hdl->sa_spill->db_data, old_data[1],
+                   hdl->sa_spill->db_size);
+               spill_attr_count =
+                   hdl->sa_spill_tab->sa_layout->lot_attr_count;
+       } else {
+               old_data[1] = NULL;
+       }
+
+       /* build descriptor of all attributes */
+
+       attr_count = bonus_attr_count + spill_attr_count;
+       if (action == SA_ADD)
+               attr_count++;
+       else if (action == SA_REMOVE)
+               attr_count--;
+
+       attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP);
+
+       /*
+        * loop through bonus and spill buffer if it exists, and
+        * build up new attr_descriptor to reset the attributes
+        */
+       k = j = 0;
+       count = bonus_attr_count;
+       hdr = SA_GET_HDR(hdl, SA_BONUS);
+       idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS);
+       for (; k != 2; k++) {
+               /* iterate over each attribute in layout */
+               for (i = 0, length_idx = 0; i != count; i++) {
+                       sa_attr_type_t attr;
+
+                       attr = idx_tab->sa_layout->lot_attrs[i];
+                       if (attr == newattr) {
+                               if (action == SA_REMOVE) {
+                                       j++;
+                                       continue;
+                               }
+                               ASSERT(SA_REGISTERED_LEN(sa, attr) == 0);
+                               ASSERT(action == SA_REPLACE);
+                               SA_ADD_BULK_ATTR(attr_desc, j, attr,
+                                   locator, datastart, buflen);
+                       } else {
+                               length = SA_REGISTERED_LEN(sa, attr);
+                               if (length == 0) {
+                                       length = hdr->sa_lengths[length_idx++];
+                               }
+
+                               SA_ADD_BULK_ATTR(attr_desc, j, attr,
+                                   NULL, (void *)
+                                   (TOC_OFF(idx_tab->sa_idx_tab[attr]) +
+                                   (uintptr_t)old_data[k]), length);
+                       }
+               }
+               if (k == 0 && hdl->sa_spill) {
+                       hdr = SA_GET_HDR(hdl, SA_SPILL);
+                       idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL);
+                       count = spill_attr_count;
+               } else {
+                       break;
+               }
+       }
+       if (action == SA_ADD) {
+               length = SA_REGISTERED_LEN(sa, newattr);
+               if (length == 0) {
+                       length = buflen;
+               }
+               SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator,
+                   datastart, buflen);
+       }
+
+       error = sa_build_layouts(hdl, attr_desc, attr_count, tx);
+
+       if (old_data[0])
+               kmem_free(old_data[0], bonus_data_size);
+       if (old_data[1])
+               kmem_free(old_data[1], spill_data_size);
+       kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count);
+
+       return (error);
+}
+
+static int
+sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
+    dmu_tx_t *tx)
+{
+       int error;
+       sa_os_t *sa = hdl->sa_os->os_sa;
+       dmu_object_type_t bonustype;
+
+       bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS));
+
+       ASSERT(hdl);
+       ASSERT(MUTEX_HELD(&hdl->sa_lock));
+
+       /* sync out registration table if necessary */
+       if (sa->sa_need_attr_registration)
+               sa_attr_register_sync(hdl, tx);
+
+       error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx);
+       if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb)
+               sa->sa_update_cb(hdl, tx);
+
+       return (error);
+}
+
+/*
+ * update or add new attribute
+ */
+int
+sa_update(sa_handle_t *hdl, sa_attr_type_t type,
+    void *buf, uint32_t buflen, dmu_tx_t *tx)
+{
+       int error;
+       sa_bulk_attr_t bulk;
+
+       bulk.sa_attr = type;
+       bulk.sa_data_func = NULL;
+       bulk.sa_length = buflen;
+       bulk.sa_data = buf;
+
+       mutex_enter(&hdl->sa_lock);
+       error = sa_bulk_update_impl(hdl, &bulk, 1, tx);
+       mutex_exit(&hdl->sa_lock);
+       return (error);
+}
+
+int
+sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr,
+    uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx)
+{
+       int error;
+       sa_bulk_attr_t bulk;
+
+       bulk.sa_attr = attr;
+       bulk.sa_data = userdata;
+       bulk.sa_data_func = locator;
+       bulk.sa_length = buflen;
+
+       mutex_enter(&hdl->sa_lock);
+       error = sa_bulk_update_impl(hdl, &bulk, 1, tx);
+       mutex_exit(&hdl->sa_lock);
+       return (error);
+}
+
+/*
+ * Return size of an attribute
+ */
+
+int
+sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size)
+{
+       sa_bulk_attr_t bulk;
+
+       bulk.sa_data = NULL;
+       bulk.sa_attr = attr;
+       bulk.sa_data_func = NULL;
+
+       ASSERT(hdl);
+       mutex_enter(&hdl->sa_lock);
+       if (sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) {
+               mutex_exit(&hdl->sa_lock);
+               return (ENOENT);
+       }
+       *size = bulk.sa_size;
+
+       mutex_exit(&hdl->sa_lock);
+       return (0);
+}
+
+int
+sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
+{
+       ASSERT(hdl);
+       ASSERT(MUTEX_HELD(&hdl->sa_lock));
+       return (sa_lookup_impl(hdl, attrs, count));
+}
+
+int
+sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
+{
+       int error;
+
+       ASSERT(hdl);
+       mutex_enter(&hdl->sa_lock);
+       error = sa_bulk_lookup_locked(hdl, attrs, count);
+       mutex_exit(&hdl->sa_lock);
+       return (error);
+}
+
+int
+sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx)
+{
+       int error;
+
+       ASSERT(hdl);
+       mutex_enter(&hdl->sa_lock);
+       error = sa_bulk_update_impl(hdl, attrs, count, tx);
+       mutex_exit(&hdl->sa_lock);
+       return (error);
+}
+
+int
+sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx)
+{
+       int error;
+
+       mutex_enter(&hdl->sa_lock);
+       error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL,
+           NULL, 0, tx);
+       mutex_exit(&hdl->sa_lock);
+       return (error);
+}
+
+void
+sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi)
+{
+       dmu_object_info_from_db((dmu_buf_t *)hdl->sa_bonus, doi);
+}
+
+void
+sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks)
+{
+       dmu_object_size_from_db((dmu_buf_t *)hdl->sa_bonus,
+           blksize, nblocks);
+}
+
+void
+sa_update_user(sa_handle_t *newhdl, sa_handle_t *oldhdl)
+{
+       (void) dmu_buf_update_user((dmu_buf_t *)newhdl->sa_bonus,
+           oldhdl, newhdl, NULL, sa_evict);
+       oldhdl->sa_bonus = NULL;
+}
+
+void
+sa_set_userp(sa_handle_t *hdl, void *ptr)
+{
+       hdl->sa_userp = ptr;
+}
+
+dmu_buf_t *
+sa_get_db(sa_handle_t *hdl)
+{
+       return ((dmu_buf_t *)hdl->sa_bonus);
+}
+
+void *
+sa_get_userdata(sa_handle_t *hdl)
+{
+       return (hdl->sa_userp);
+}
+
+void
+sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func)
+{
+       ASSERT(MUTEX_HELD(&os->os_sa->sa_lock));
+       os->os_sa->sa_update_cb = func;
+}
+
+void
+sa_register_update_callback(objset_t *os, sa_update_cb_t *func)
+{
+
+       mutex_enter(&os->os_sa->sa_lock);
+       sa_register_update_callback_locked(os, func);
+       mutex_exit(&os->os_sa->sa_lock);
+}
+
+uint64_t
+sa_handle_object(sa_handle_t *hdl)
+{
+       return (hdl->sa_bonus->db_object);
+}
+
+boolean_t
+sa_enabled(objset_t *os)
+{
+       return (os->os_sa == NULL);
+}
+
+int
+sa_set_sa_object(objset_t *os, uint64_t sa_object)
+{
+       sa_os_t *sa = os->os_sa;
+
+       if (sa->sa_master_obj)
+               return (1);
+
+       sa->sa_master_obj = sa_object;
+
+       return (0);
+}
+
+int
+sa_hdrsize(void *arg)
+{
+       sa_hdr_phys_t *hdr = arg;
+
+       return (SA_HDR_SIZE(hdr));
+}
+
+void
+sa_handle_lock(sa_handle_t *hdl)
+{
+       ASSERT(hdl);
+       mutex_enter(&hdl->sa_lock);
+}
+
+void
+sa_handle_unlock(sa_handle_t *hdl)
+{
+       ASSERT(hdl);
+       mutex_exit(&hdl->sa_lock);
+}
index ca7076c..f515be6 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
-
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/zfs_context.h>
 #include <sys/zio.h>
-#include <sys/zio_checksum.h>
-
-/*
- * SHA-256 checksum, as specified in FIPS 180-3, available at:
- * http://csrc.nist.gov/publications/PubsFIPS.html
- *
- * This is a very compact implementation of SHA-256.
- * It is designed to be simple and portable, not to be fast.
- */
-
-/*
- * The literal definitions of Ch() and Maj() according to FIPS 180-3 are:
- *
- *     Ch(x, y, z)     (x & y) ^ (~x & z)
- *     Maj(x, y, z)    (x & y) ^ (x & z) ^ (y & z)
- *
- * We use equivalent logical reductions here that require one less op.
- */
-#define        Ch(x, y, z)     ((z) ^ ((x) & ((y) ^ (z))))
-#define        Maj(x, y, z)    (((x) & (y)) ^ ((z) & ((x) ^ (y))))
-#define        Rot32(x, s)     (((x) >> s) | ((x) << (32 - s)))
-#define        SIGMA0(x)       (Rot32(x, 2) ^ Rot32(x, 13) ^ Rot32(x, 22))
-#define        SIGMA1(x)       (Rot32(x, 6) ^ Rot32(x, 11) ^ Rot32(x, 25))
-#define        sigma0(x)       (Rot32(x, 7) ^ Rot32(x, 18) ^ ((x) >> 3))
-#define        sigma1(x)       (Rot32(x, 17) ^ Rot32(x, 19) ^ ((x) >> 10))
-
-static const uint32_t SHA256_K[64] = {
-       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
-       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
-       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
-       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
-       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
-       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
-       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
-       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
-       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
-       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
-       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
-       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
-       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
-       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
-       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
-       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-};
-
-static void
-SHA256Transform(uint32_t *H, const uint8_t *cp)
-{
-       uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64];
-
-       for (t = 0; t < 16; t++, cp += 4)
-               W[t] = (cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | cp[3];
-
-       for (t = 16; t < 64; t++)
-               W[t] = sigma1(W[t - 2]) + W[t - 7] +
-                   sigma0(W[t - 15]) + W[t - 16];
-
-       a = H[0]; b = H[1]; c = H[2]; d = H[3];
-       e = H[4]; f = H[5]; g = H[6]; h = H[7];
-
-       for (t = 0; t < 64; t++) {
-               T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t];
-               T2 = SIGMA0(a) + Maj(a, b, c);
-               h = g; g = f; f = e; e = d + T1;
-               d = c; c = b; b = a; a = T1 + T2;
-       }
-
-       H[0] += a; H[1] += b; H[2] += c; H[3] += d;
-       H[4] += e; H[5] += f; H[6] += g; H[7] += h;
-}
+#include <sys/sha2.h>
 
 void
 zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp)
 {
-       uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
-           0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
-       uint8_t pad[128];
-       int i, padsize;
-
-       for (i = 0; i < (size & ~63ULL); i += 64)
-               SHA256Transform(H, (uint8_t *)buf + i);
-
-       for (padsize = 0; i < size; i++)
-               pad[padsize++] = *((uint8_t *)buf + i);
-
-       for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++)
-               pad[padsize] = 0;
-
-       for (i = 56; i >= 0; i -= 8)
-               pad[padsize++] = (size << 3) >> i;
-
-       for (i = 0; i < padsize; i += 64)
-               SHA256Transform(H, pad + i);
-
-       ZIO_SET_CHECKSUM(zcp,
-           (uint64_t)H[0] << 32 | H[1],
-           (uint64_t)H[2] << 32 | H[3],
-           (uint64_t)H[4] << 32 | H[5],
-           (uint64_t)H[6] << 32 | H[7]);
+       SHA2_CTX ctx;
+       zio_cksum_t tmp;
+
+       SHA2Init(SHA256, &ctx);
+       SHA2Update(&ctx, buf, size);
+       SHA2Final(&tmp, &ctx);
+
+       /*
+        * A prior implementation of this function used a
+        * private SHA256 implementation that always wrote things
+        * out in Big Endian and there wasn't a byteswap variant of it.
+        * To preserve on-disk compatibility we need to force that
+        * behaviour.
+        */
+       zcp->zc_word[0] = BE_64(tmp.zc_word[0]);
+       zcp->zc_word[1] = BE_64(tmp.zc_word[1]);
+       zcp->zc_word[2] = BE_64(tmp.zc_word[2]);
+       zcp->zc_word[3] = BE_64(tmp.zc_word[3]);
 }
index d7ed23e..d7c5de0 100644 (file)
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
-#include <sys/zio_compress.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/zap.h>
 #include <sys/zil.h>
+#include <sys/ddt.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
 #include <sys/uberblock_impl.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
 #include <sys/arc.h>
 #include <sys/callb.h>
 #include <sys/systeminfo.h>
-#include <sys/sunddi.h>
 #include <sys/spa_boot.h>
 #include <sys/zfs_ioctl.h>
+#include <sys/dsl_scan.h>
 
 #ifdef _KERNEL
+#include <sys/bootprops.h>
+#include <sys/callb.h>
+#include <sys/cpupart.h>
+#include <sys/pool.h>
+#include <sys/sysdc.h>
 #include <sys/zone.h>
 #endif /* _KERNEL */
 
 #include "zfs_prop.h"
 #include "zfs_comutil.h"
 
-enum zti_modes {
+typedef enum zti_modes {
        zti_mode_fixed,                 /* value is # of threads (min 1) */
        zti_mode_online_percent,        /* value is % of online CPUs */
-       zti_mode_tune,                  /* fill from zio_taskq_tune_* */
+       zti_mode_batch,                 /* cpu-intensive; value is ignored */
+       zti_mode_null,                  /* don't create a taskq */
        zti_nmodes
-};
+} zti_modes_t;
 
-#define        ZTI_THREAD_FIX(n)       { zti_mode_fixed, (n) }
-#define        ZTI_THREAD_PCT(n)       { zti_mode_online_percent, (n) }
-#define        ZTI_THREAD_TUNE         { zti_mode_tune, 0 }
+#define        ZTI_FIX(n)      { zti_mode_fixed, (n) }
+#define        ZTI_PCT(n)      { zti_mode_online_percent, (n) }
+#define        ZTI_BATCH       { zti_mode_batch, 0 }
+#define        ZTI_NULL        { zti_mode_null, 0 }
 
-#define        ZTI_THREAD_ONE          ZTI_THREAD_FIX(1)
+#define        ZTI_ONE         ZTI_FIX(1)
 
 typedef struct zio_taskq_info {
-       const char *zti_name;
-       struct {
-               enum zti_modes zti_mode;
-               uint_t zti_value;
-       } zti_nthreads[ZIO_TASKQ_TYPES];
+       enum zti_modes zti_mode;
+       uint_t zti_value;
 } zio_taskq_info_t;
 
 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
-                               "issue",                "intr"
+       "issue", "issue_high", "intr", "intr_high"
 };
 
-const zio_taskq_info_t zio_taskqs[ZIO_TYPES] = {
-       /*                      ISSUE                   INTR            */
-       { "spa_zio_null",       { ZTI_THREAD_ONE,       ZTI_THREAD_ONE } },
-       { "spa_zio_read",       { ZTI_THREAD_FIX(8),    ZTI_THREAD_TUNE } },
-       { "spa_zio_write",      { ZTI_THREAD_TUNE,      ZTI_THREAD_FIX(8) } },
-       { "spa_zio_free",       { ZTI_THREAD_ONE,       ZTI_THREAD_ONE } },
-       { "spa_zio_claim",      { ZTI_THREAD_ONE,       ZTI_THREAD_ONE } },
-       { "spa_zio_ioctl",      { ZTI_THREAD_ONE,       ZTI_THREAD_ONE } },
+/*
+ * Define the taskq threads for the following I/O types:
+ *     NULL, READ, WRITE, FREE, CLAIM, and IOCTL
+ */
+const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
+       /* ISSUE        ISSUE_HIGH      INTR            INTR_HIGH */
+       { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
+       { ZTI_FIX(8),   ZTI_NULL,       ZTI_BATCH,      ZTI_NULL },
+       { ZTI_BATCH,    ZTI_FIX(5),     ZTI_FIX(8),     ZTI_FIX(5) },
+       { ZTI_FIX(100), ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
+       { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
+       { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
 };
 
-enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent;
-uint_t zio_taskq_tune_value = 80;      /* #threads = 80% of # online CPUs */
-
-static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
+static dsl_syncfunc_t spa_sync_props;
 static boolean_t spa_has_active_shared_spare(spa_t *spa);
+static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
+    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
+    char **ereport);
+
+uint_t         zio_taskq_batch_pct = 100;      /* 1 thread per cpu in pset */
+id_t           zio_taskq_psrset_bind = PS_NONE;
+boolean_t      zio_taskq_sysdc = B_TRUE;       /* use SDC scheduling class */
+uint_t         zio_taskq_basedc = 80;          /* base duty cycle */
+
+boolean_t      spa_create_process = B_TRUE;    /* no process ==> no sysdc */
+
+/*
+ * This (illegal) pool name is used when temporarily importing a spa_t in order
+ * to get the vdev stats associated with the imported devices.
+ */
+#define        TRYIMPORT_NAME  "$import"
 
 /*
  * ==========================================================================
@@ -144,7 +165,7 @@ static void
 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 {
        uint64_t size;
-       uint64_t used;
+       uint64_t alloc;
        uint64_t cap, version;
        zprop_source_t src = ZPROP_SRC_NONE;
        spa_config_dirent_t *dp;
@@ -152,17 +173,20 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
        ASSERT(MUTEX_HELD(&spa->spa_props_lock));
 
        if (spa->spa_root_vdev != NULL) {
-               size = spa_get_space(spa);
-               used = spa_get_alloc(spa);
+               alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+               size = metaslab_class_get_space(spa_normal_class(spa));
                spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
                spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
-               spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src);
-               spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL,
-                   size - used, src);
+               spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
+               spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
+                   size - alloc, src);
 
-               cap = (size == 0) ? 0 : (used * 100 / size);
+               cap = (size == 0) ? 0 : (alloc * 100 / size);
                spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
 
+               spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
+                   ddt_get_pool_dedup_ratio(spa), src);
+
                spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
                    spa->spa_root_vdev->vdev_state, src);
 
@@ -197,9 +221,9 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 int
 spa_prop_get(spa_t *spa, nvlist_t **nvp)
 {
+       objset_t *mos = spa->spa_meta_objset;
        zap_cursor_t zc;
        zap_attribute_t za;
-       objset_t *mos = spa->spa_meta_objset;
        int err;
 
        VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
@@ -212,7 +236,7 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
        spa_prop_get_config(spa, nvp);
 
        /* If no pool property object, no more prop to get. */
-       if (spa->spa_pool_props_object == 0) {
+       if (mos == NULL || spa->spa_pool_props_object == 0) {
                mutex_exit(&spa->spa_props_lock);
                return (0);
        }
@@ -371,12 +395,14 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
                                        break;
                                }
 
-                               if (error = dmu_objset_open(strval, DMU_OST_ZFS,
-                                   DS_MODE_USER | DS_MODE_READONLY, &os))
+                               if (error = dmu_objset_hold(strval, FTAG, &os))
                                        break;
 
-                               /* We don't support gzip bootable datasets */
-                               if ((error = dsl_prop_get_integer(strval,
+                               /* Must be ZPL and not gzip compressed. */
+
+                               if (dmu_objset_type(os) != DMU_OST_ZFS) {
+                                       error = ENOTSUP;
+                               } else if ((error = dsl_prop_get_integer(strval,
                                    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
                                    &compress, NULL)) == 0 &&
                                    !BOOTFS_COMPRESS_VALID(compress)) {
@@ -384,7 +410,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
                                } else {
                                        objnum = dmu_objset_id(os);
                                }
-                               dmu_objset_close(os);
+                               dmu_objset_rele(os, FTAG);
                        }
                        break;
 
@@ -432,6 +458,16 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
                            strcmp(slash, "/..") == 0)
                                error = EINVAL;
                        break;
+
+               case ZPOOL_PROP_DEDUPDITTO:
+                       if (spa_version(spa) < SPA_VERSION_DEDUP)
+                               error = ENOTSUP;
+                       else
+                               error = nvpair_value_uint64(elem, &intval);
+                       if (error == 0 &&
+                           intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
+                               error = EINVAL;
+                       break;
                }
 
                if (error)
@@ -565,63 +601,185 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
            offsetof(spa_error_entry_t, se_avl));
 }
 
-/*
- * Activate an uninitialized pool.
- */
-static void
-spa_activate(spa_t *spa, int mode)
+static taskq_t *
+spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
+    uint_t value)
 {
-       ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+       uint_t flags = TASKQ_PREPOPULATE;
+       boolean_t batch = B_FALSE;
 
-       spa->spa_state = POOL_STATE_ACTIVE;
-       spa->spa_mode = mode;
+       switch (mode) {
+       case zti_mode_null:
+               return (NULL);          /* no taskq needed */
+
+       case zti_mode_fixed:
+               ASSERT3U(value, >=, 1);
+               value = MAX(value, 1);
+               break;
+
+       case zti_mode_batch:
+               batch = B_TRUE;
+               flags |= TASKQ_THREADS_CPU_PCT;
+               value = zio_taskq_batch_pct;
+               break;
+
+       case zti_mode_online_percent:
+               flags |= TASKQ_THREADS_CPU_PCT;
+               break;
+
+       default:
+               panic("unrecognized mode for %s taskq (%u:%u) in "
+                   "spa_activate()",
+                   name, mode, value);
+               break;
+       }
 
-       spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops);
-       spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);
+       if (zio_taskq_sysdc && spa->spa_proc != &p0) {
+               if (batch)
+                       flags |= TASKQ_DC_BATCH;
 
+               return (taskq_create_sysdc(name, value, 50, INT_MAX,
+                   spa->spa_proc, zio_taskq_basedc, flags));
+       }
+       return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
+           spa->spa_proc, flags));
+}
+
+static void
+spa_create_zio_taskqs(spa_t *spa)
+{
        for (int t = 0; t < ZIO_TYPES; t++) {
-               const zio_taskq_info_t *ztip = &zio_taskqs[t];
                for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
-                       enum zti_modes mode = ztip->zti_nthreads[q].zti_mode;
-                       uint_t value = ztip->zti_nthreads[q].zti_value;
+                       const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
+                       enum zti_modes mode = ztip->zti_mode;
+                       uint_t value = ztip->zti_value;
                        char name[32];
 
                        (void) snprintf(name, sizeof (name),
-                           "%s_%s", ztip->zti_name, zio_taskq_types[q]);
+                           "%s_%s", zio_type_name[t], zio_taskq_types[q]);
 
-                       if (mode == zti_mode_tune) {
-                               mode = zio_taskq_tune_mode;
-                               value = zio_taskq_tune_value;
-                               if (mode == zti_mode_tune)
-                                       mode = zti_mode_online_percent;
-                       }
+                       spa->spa_zio_taskq[t][q] =
+                           spa_taskq_create(spa, name, mode, value);
+               }
+       }
+}
 
-                       switch (mode) {
-                       case zti_mode_fixed:
-                               ASSERT3U(value, >=, 1);
-                               value = MAX(value, 1);
+#ifdef _KERNEL
+static void
+spa_thread(void *arg)
+{
+       callb_cpr_t cprinfo;
 
-                               spa->spa_zio_taskq[t][q] = taskq_create(name,
-                                   value, maxclsyspri, 50, INT_MAX,
-                                   TASKQ_PREPOPULATE);
-                               break;
+       spa_t *spa = arg;
+       user_t *pu = PTOU(curproc);
 
-                       case zti_mode_online_percent:
-                               spa->spa_zio_taskq[t][q] = taskq_create(name,
-                                   value, maxclsyspri, 50, INT_MAX,
-                                   TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
-                               break;
+       CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
+           spa->spa_name);
 
-                       case zti_mode_tune:
-                       default:
-                               panic("unrecognized mode for "
-                                   "zio_taskqs[%u]->zti_nthreads[%u] (%u:%u) "
-                                   "in spa_activate()",
-                                   t, q, mode, value);
-                               break;
+       ASSERT(curproc != &p0);
+       (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
+           "zpool-%s", spa->spa_name);
+       (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
+
+       /* bind this thread to the requested psrset */
+       if (zio_taskq_psrset_bind != PS_NONE) {
+               pool_lock();
+               mutex_enter(&cpu_lock);
+               mutex_enter(&pidlock);
+               mutex_enter(&curproc->p_lock);
+
+               if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
+                   0, NULL, NULL) == 0)  {
+                       curthread->t_bind_pset = zio_taskq_psrset_bind;
+               } else {
+                       cmn_err(CE_WARN,
+                           "Couldn't bind process for zfs pool \"%s\" to "
+                           "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
+               }
+
+               mutex_exit(&curproc->p_lock);
+               mutex_exit(&pidlock);
+               mutex_exit(&cpu_lock);
+               pool_unlock();
+       }
+
+       if (zio_taskq_sysdc) {
+               sysdc_thread_enter(curthread, 100, 0);
+       }
+
+       spa->spa_proc = curproc;
+       spa->spa_did = curthread->t_did;
+
+       spa_create_zio_taskqs(spa);
+
+       mutex_enter(&spa->spa_proc_lock);
+       ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
+
+       spa->spa_proc_state = SPA_PROC_ACTIVE;
+       cv_broadcast(&spa->spa_proc_cv);
+
+       CALLB_CPR_SAFE_BEGIN(&cprinfo);
+       while (spa->spa_proc_state == SPA_PROC_ACTIVE)
+               cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
+       CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
+
+       ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
+       spa->spa_proc_state = SPA_PROC_GONE;
+       spa->spa_proc = &p0;
+       cv_broadcast(&spa->spa_proc_cv);
+       CALLB_CPR_EXIT(&cprinfo);       /* drops spa_proc_lock */
+
+       mutex_enter(&curproc->p_lock);
+       lwp_exit();
+}
+#endif
+
+/*
+ * Activate an uninitialized pool.
+ */
+static void
+spa_activate(spa_t *spa, int mode)
+{
+       ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+
+       spa->spa_state = POOL_STATE_ACTIVE;
+       spa->spa_mode = mode;
+
+       spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
+       spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
+
+       /* Try to create a covering process */
+       mutex_enter(&spa->spa_proc_lock);
+       ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
+       ASSERT(spa->spa_proc == &p0);
+       spa->spa_did = 0;
+
+       /* Only create a process if we're going to be around a while. */
+       if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
+               if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
+                   NULL, 0) == 0) {
+                       spa->spa_proc_state = SPA_PROC_CREATED;
+                       while (spa->spa_proc_state == SPA_PROC_CREATED) {
+                               cv_wait(&spa->spa_proc_cv,
+                                   &spa->spa_proc_lock);
                        }
+                       ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
+                       ASSERT(spa->spa_proc != &p0);
+                       ASSERT(spa->spa_did != 0);
+               } else {
+#ifdef _KERNEL
+                       cmn_err(CE_WARN,
+                           "Couldn't create process for zfs pool \"%s\"\n",
+                           spa->spa_name);
+#endif
                }
        }
+       mutex_exit(&spa->spa_proc_lock);
+
+       /* If we didn't create a process, we need to create our taskqs. */
+       if (spa->spa_proc == &p0) {
+               spa_create_zio_taskqs(spa);
+       }
 
        list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
            offsetof(vdev_t, vdev_config_dirty_node));
@@ -658,7 +816,8 @@ spa_deactivate(spa_t *spa)
 
        for (int t = 0; t < ZIO_TYPES; t++) {
                for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
-                       taskq_destroy(spa->spa_zio_taskq[t][q]);
+                       if (spa->spa_zio_taskq[t][q] != NULL)
+                               taskq_destroy(spa->spa_zio_taskq[t][q]);
                        spa->spa_zio_taskq[t][q] = NULL;
                }
        }
@@ -679,6 +838,31 @@ spa_deactivate(spa_t *spa)
        avl_destroy(&spa->spa_errlist_last);
 
        spa->spa_state = POOL_STATE_UNINITIALIZED;
+
+       mutex_enter(&spa->spa_proc_lock);
+       if (spa->spa_proc_state != SPA_PROC_NONE) {
+               ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
+               spa->spa_proc_state = SPA_PROC_DEACTIVATE;
+               cv_broadcast(&spa->spa_proc_cv);
+               while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
+                       ASSERT(spa->spa_proc != &p0);
+                       cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
+               }
+               ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
+               spa->spa_proc_state = SPA_PROC_NONE;
+       }
+       ASSERT(spa->spa_proc == &p0);
+       mutex_exit(&spa->spa_proc_lock);
+
+       /*
+        * We want to make sure spa_thread() has actually exited the ZFS
+        * module, so that the module can't be unloaded out from underneath
+        * it.
+        */
+       if (spa->spa_did != 0) {
+               thread_join(spa->spa_did);
+               spa->spa_did = 0;
+       }
 }
 
 /*
@@ -759,14 +943,19 @@ spa_unload(spa_t *spa)
                spa->spa_async_zio_root = NULL;
        }
 
+       bpobj_close(&spa->spa_deferred_bpobj);
+
        /*
         * Close the dsl pool.
         */
        if (spa->spa_dsl_pool) {
                dsl_pool_close(spa->spa_dsl_pool);
                spa->spa_dsl_pool = NULL;
+               spa->spa_meta_objset = NULL;
        }
 
+       ddt_unload(spa);
+
        spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
        /*
@@ -919,7 +1108,7 @@ spa_load_spares(spa_t *spa)
            KM_SLEEP);
        for (i = 0; i < spa->spa_spares.sav_count; i++)
                spares[i] = vdev_config_generate(spa,
-                   spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
+                   spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
        VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
            ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
        for (i = 0; i < spa->spa_spares.sav_count; i++)
@@ -1045,7 +1234,7 @@ spa_load_l2cache(spa_t *spa)
        l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
        for (i = 0; i < sav->sav_count; i++)
                l2cache[i] = vdev_config_generate(spa,
-                   sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
+                   sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
        VERIFY(nvlist_add_nvlist_array(sav->sav_config,
            ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
 out:
@@ -1099,26 +1288,23 @@ spa_check_removed(vdev_t *vd)
  * that the label does not contain the most up-to-date information.
  */
 void
-spa_load_log_state(spa_t *spa)
+spa_load_log_state(spa_t *spa, nvlist_t *nv)
 {
-       nvlist_t *nv, *nvroot, **child;
-       uint64_t is_log;
-       uint_t children;
-       vdev_t *rvd = spa->spa_root_vdev;
-
-       VERIFY(load_nvlist(spa, spa->spa_config_object, &nv) == 0);
-       VERIFY(nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
-       VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
-           &child, &children) == 0);
+       vdev_t *ovd, *rvd = spa->spa_root_vdev;
 
-       for (int c = 0; c < children; c++) {
-               vdev_t *tvd = rvd->vdev_child[c];
+       /*
+        * Load the original root vdev tree from the passed config.
+        */
+       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+       VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
 
-               if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
-                   &is_log) == 0 && is_log)
-                       vdev_load_log_state(tvd, child[c]);
+       for (int c = 0; c < rvd->vdev_children; c++) {
+               vdev_t *cvd = rvd->vdev_child[c];
+               if (cvd->vdev_islog)
+                       vdev_load_log_state(cvd, ovd->vdev_child[c]);
        }
-       nvlist_free(nv);
+       vdev_free(ovd);
+       spa_config_exit(spa, SCL_ALL, FTAG);
 }
 
 /*
@@ -1133,7 +1319,7 @@ spa_check_logs(spa_t *spa)
        case SPA_LOG_UNKNOWN:
                if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
                    DS_FIND_CHILDREN)) {
-                       spa->spa_log_state = SPA_LOG_MISSING;
+                       spa_set_log_state(spa, SPA_LOG_MISSING);
                        return (1);
                }
                break;
@@ -1141,186 +1327,486 @@ spa_check_logs(spa_t *spa)
        return (0);
 }
 
-/*
- * Load an existing storage pool, using the pool's builtin spa_config as a
- * source of configuration information.
- */
-static int
-spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
+static boolean_t
+spa_passivate_log(spa_t *spa)
 {
-       int error = 0;
-       nvlist_t *nvroot = NULL;
-       vdev_t *rvd;
-       uberblock_t *ub = &spa->spa_uberblock;
-       uint64_t config_cache_txg = spa->spa_config_txg;
-       uint64_t pool_guid;
-       uint64_t version;
-       uint64_t autoreplace = 0;
-       int orig_mode = spa->spa_mode;
-       char *ereport = FM_EREPORT_ZFS_POOL;
+       vdev_t *rvd = spa->spa_root_vdev;
+       boolean_t slog_found = B_FALSE;
 
-       /*
-        * If this is an untrusted config, access the pool in read-only mode.
-        * This prevents things like resilvering recently removed devices.
-        */
-       if (!mosconfig)
-               spa->spa_mode = FREAD;
+       ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 
-       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       if (!spa_has_slogs(spa))
+               return (B_FALSE);
 
-       spa->spa_load_state = state;
+       for (int c = 0; c < rvd->vdev_children; c++) {
+               vdev_t *tvd = rvd->vdev_child[c];
+               metaslab_group_t *mg = tvd->vdev_mg;
 
-       if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
-           nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
-               error = EINVAL;
-               goto out;
+               if (tvd->vdev_islog) {
+                       metaslab_group_passivate(mg);
+                       slog_found = B_TRUE;
+               }
        }
 
-       /*
-        * Versioning wasn't explicitly added to the label until later, so if
-        * it's not present treat it as the initial version.
-        */
-       if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
-               version = SPA_VERSION_INITIAL;
-
-       (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
-           &spa->spa_config_txg);
+       return (slog_found);
+}
 
-       if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
-           spa_guid_exists(pool_guid, 0)) {
-               error = EEXIST;
-               goto out;
-       }
+static void
+spa_activate_log(spa_t *spa)
+{
+       vdev_t *rvd = spa->spa_root_vdev;
 
-       spa->spa_load_guid = pool_guid;
+       ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 
-       /*
-        * Create "The Godfather" zio to hold all async IOs
-        */
-       spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
-           ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
+       for (int c = 0; c < rvd->vdev_children; c++) {
+               vdev_t *tvd = rvd->vdev_child[c];
+               metaslab_group_t *mg = tvd->vdev_mg;
 
-       /*
-        * Parse the configuration into a vdev tree.  We explicitly set the
-        * value that will be returned by spa_version() since parsing the
-        * configuration requires knowing the version number.
-        */
-       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-       spa->spa_ubsync.ub_version = version;
-       error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
-       spa_config_exit(spa, SCL_ALL, FTAG);
+               if (tvd->vdev_islog)
+                       metaslab_group_activate(mg);
+       }
+}
 
-       if (error != 0)
-               goto out;
+int
+spa_offline_log(spa_t *spa)
+{
+       int error = 0;
 
-       ASSERT(spa->spa_root_vdev == rvd);
-       ASSERT(spa_guid(spa) == pool_guid);
+       if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
+           NULL, DS_FIND_CHILDREN)) == 0) {
 
-       /*
-        * Try to open all vdevs, loading each label in the process.
-        */
-       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-       error = vdev_open(rvd);
-       spa_config_exit(spa, SCL_ALL, FTAG);
-       if (error != 0)
-               goto out;
+               /*
+                * We successfully offlined the log device, sync out the
+                * current txg so that the "stubby" block can be removed
+                * by zil_sync().
+                */
+               txg_wait_synced(spa->spa_dsl_pool, 0);
+       }
+       return (error);
+}
 
-       /*
-        * We need to validate the vdev labels against the configuration that
-        * we have in hand, which is dependent on the setting of mosconfig. If
-        * mosconfig is true then we're validating the vdev labels based on
-        * that config. Otherwise, we're validating against the cached config
-        * (zpool.cache) that was read when we loaded the zfs module, and then
-        * later we will recursively call spa_load() and validate against
-        * the vdev config.
-        */
-       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-       error = vdev_validate(rvd);
-       spa_config_exit(spa, SCL_ALL, FTAG);
-       if (error != 0)
-               goto out;
+static void
+spa_aux_check_removed(spa_aux_vdev_t *sav)     /* run the removal check over every aux (spare/L2ARC) vdev */
+{
+       for (int i = 0; i < sav->sav_count; i++)
+               spa_check_removed(sav->sav_vdevs[i]);
+}
 
-       if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
-               error = ENXIO;
-               goto out;
-       }
+void
+spa_claim_notify(zio_t *zio)
+{
+       spa_t *spa = zio->io_spa;
 
-       /*
-        * Find the best uberblock.
-        */
-       vdev_uberblock_load(NULL, rvd, ub);
+       if (zio->io_error)
+               return;
 
-       /*
-        * If we weren't able to find a single valid uberblock, return failure.
-        */
-       if (ub->ub_txg == 0) {
-               vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                   VDEV_AUX_CORRUPT_DATA);
-               error = ENXIO;
-               goto out;
-       }
+       mutex_enter(&spa->spa_props_lock);      /* any mutex will do */
+       if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
+               spa->spa_claim_max_txg = zio->io_bp->blk_birth;
+       mutex_exit(&spa->spa_props_lock);
+}
 
-       /*
-        * If the pool is newer than the code, we can't open it.
-        */
-       if (ub->ub_version > SPA_VERSION) {
-               vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                   VDEV_AUX_VERSION_NEWER);
-               error = ENOTSUP;
-               goto out;
-       }
+typedef struct spa_load_error {
+       uint64_t        sle_meta_count; /* read errors on metadata (incl. indirect) blocks */
+       uint64_t        sle_data_count; /* read errors on plain data blocks */
+} spa_load_error_t;
 
-       /*
-        * If the vdev guid sum doesn't match the uberblock, we have an
-        * incomplete configuration.
-        */
-       if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
-               vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                   VDEV_AUX_BAD_GUID_SUM);
-               error = ENXIO;
-               goto out;
-       }
+static void
+spa_load_verify_done(zio_t *zio)
+{
+       blkptr_t *bp = zio->io_bp;
+       spa_load_error_t *sle = zio->io_private;
+       dmu_object_type_t type = BP_GET_TYPE(bp);
+       int error = zio->io_error;
 
-       /*
-        * Initialize internal SPA structures.
-        */
-       spa->spa_state = POOL_STATE_ACTIVE;
-       spa->spa_ubsync = spa->spa_uberblock;
-       spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
-       error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
        if (error) {
-               vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                   VDEV_AUX_CORRUPT_DATA);
-               goto out;
+               if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) &&
+                   type != DMU_OT_INTENT_LOG)
+                       atomic_add_64(&sle->sle_meta_count, 1);
+               else
+                       atomic_add_64(&sle->sle_data_count, 1);
        }
-       spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
+       zio_data_buf_free(zio->io_data, zio->io_size);
+}
 
-       if (zap_lookup(spa->spa_meta_objset,
-           DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
-           sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
-               vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                   VDEV_AUX_CORRUPT_DATA);
-               error = EIO;
-               goto out;
+/*ARGSUSED*/
+static int
+spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+       if (bp != NULL) {
+               zio_t *rio = arg;
+               size_t size = BP_GET_PSIZE(bp);
+               void *data = zio_data_buf_alloc(size);
+
+               zio_nowait(zio_read(rio, spa, bp, data, size,
+                   spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
+                   ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
+                   ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
        }
+       return (0);
+}
 
-       if (!mosconfig) {
-               nvlist_t *newconfig;
-               uint64_t hostid;
+static int
+spa_load_verify(spa_t *spa)
+{
+       zio_t *rio;
+       spa_load_error_t sle = { 0 };
+       zpool_rewind_policy_t policy;
+       boolean_t verify_ok = B_FALSE;
+       int error;
 
-               if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
-                       vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                           VDEV_AUX_CORRUPT_DATA);
-                       error = EIO;
-                       goto out;
-               }
+       zpool_get_rewind_policy(spa->spa_config, &policy);
 
-               if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig,
+       if (policy.zrp_request & ZPOOL_NEVER_REWIND)
+               return (0);
+
+       rio = zio_root(spa, NULL, &sle,
+           ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+
+       error = traverse_pool(spa, spa->spa_verify_min_txg,
+           TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
+
+       (void) zio_wait(rio);
+
+       spa->spa_load_meta_errors = sle.sle_meta_count;
+       spa->spa_load_data_errors = sle.sle_data_count;
+
+       if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
+           sle.sle_data_count <= policy.zrp_maxdata) {
+               verify_ok = B_TRUE;
+               spa->spa_load_txg = spa->spa_uberblock.ub_txg;
+               spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
+       } else {
+               spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
+       }
+
+       if (error) {
+               if (error != ENXIO && error != EIO)
+                       error = EIO;
+               return (error);
+       }
+
+       return (verify_ok ? 0 : EIO);
+}
+
+/*
+ * Find a value in the pool props object.
+ */
+static void
+spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)    /* best-effort read of one pool property into *val */
+{
+       (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
+           zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);       /* (void): lookup failure deliberately ignored */
+}
+
+/*
+ * Find a value in the pool directory object.
+ */
+static int
+spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)      /* read a uint64 entry from the MOS pool directory */
+{
+       return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+           name, sizeof (uint64_t), 1, val));  /* 0 on success, else zap_lookup() error */
+}
+
+static int
+spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
+{
+       vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);        /* mark the vdev unopenable, recording 'aux' as the reason */
+       return (err);   /* pass errno through so callers can write 'return (spa_vdev_err(...))' */
+}
+
+/*
+ * Fix up config after a partly-completed split.  This is done with the
+ * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
+ * pool have that entry in their config, but only the splitting one contains
+ * a list of all the guids of the vdevs that are being split off.
+ *
+ * This function determines what to do with that list: either rejoin
+ * all the disks to the pool, or complete the splitting process.  To attempt
+ * the rejoin, each disk that is offlined is marked online again, and
+ * we do a reopen() call.  If the vdev label for every disk that was
+ * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
+ * then we call vdev_split() on each disk, and complete the split.
+ *
+ * Otherwise we leave the config alone, with all the vdevs in place in
+ * the original pool.
+ */
+static void
+spa_try_repair(spa_t *spa, nvlist_t *config)
+{
+       uint_t extracted;
+       uint64_t *glist;
+       uint_t i, gcount;
+       nvlist_t *nvl;
+       vdev_t **vd;
+       boolean_t attempt_reopen;
+
+       if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)        /* no split was in progress */
+               return;
+
+       /* check that the config is complete */
+       if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
+           &glist, &gcount) != 0)
+               return;
+
+       vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); /* zeroed: unresolved slots stay NULL */
+
+       /* attempt to online all the vdevs & validate */
+       attempt_reopen = B_TRUE;
+       for (i = 0; i < gcount; i++) {
+               if (glist[i] == 0)      /* vdev is hole */
+                       continue;
+
+               vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
+               if (vd[i] == NULL) {
+                       /*
+                        * Don't bother attempting to reopen the disks;
+                        * just do the split.
+                        */
+                       attempt_reopen = B_FALSE;
+               } else {
+                       /* attempt to re-online it */
+                       vd[i]->vdev_offline = B_FALSE;
+               }
+       }
+
+       if (attempt_reopen) {
+               vdev_reopen(spa->spa_root_vdev);
+
+               /* check each device to see what state it's in */
+               for (extracted = 0, i = 0; i < gcount; i++) {   /* extracted = count of vdevs confirmed split off */
+                       if (vd[i] != NULL &&
+                           vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
+                               break;
+                       ++extracted;
+               }
+       }
+
+       /*
+        * If every disk has been moved to the new pool, or if we never
+        * even attempted to look at them, then we split them off for
+        * good.
+        */
+       if (!attempt_reopen || gcount == extracted) {   /* short-circuit: 'extracted' only read when a reopen was attempted */
+               for (i = 0; i < gcount; i++)
+                       if (vd[i] != NULL)
+                               vdev_split(vd[i]);
+               vdev_reopen(spa->spa_root_vdev);
+       }
+
+       kmem_free(vd, gcount * sizeof (vdev_t *));
+}
+
+static int
+spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
+    boolean_t mosconfig)        /* mosconfig: config came from the MOS and is trusted */
+{
+       nvlist_t *config = spa->spa_config;
+       char *ereport = FM_EREPORT_ZFS_POOL;    /* may be narrowed by spa_load_impl() */
+       int error;
+       uint64_t pool_guid;
+       nvlist_t *nvl;
+
+       if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
+               return (EINVAL);
+
+       /*
+        * Versioning wasn't explicitly added to the label until later, so if
+        * it's not present treat it as the initial version.
+        */
+       if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+           &spa->spa_ubsync.ub_version) != 0)
+               spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
+
+       (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+           &spa->spa_config_txg);
+
+       if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
+           spa_guid_exists(pool_guid, 0)) {
+               error = EEXIST; /* importing a pool whose guid is already active */
+       } else {
+               spa->spa_load_guid = pool_guid;
+
+               if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
+                   &nvl) == 0) {       /* remember in-progress split for spa_try_repair() */
+                       VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
+                           KM_SLEEP) == 0);
+               }
+
+               error = spa_load_impl(spa, pool_guid, config, state, type,
+                   mosconfig, &ereport);
+       }
+
+       spa->spa_minref = refcount_count(&spa->spa_refcount);
+       if (error && error != EBADF)    /* EBADF (e.g. foreign-hostid pool) is not ereport-worthy */
+               zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
+       spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
+       spa->spa_ena = 0;
+
+       return (error);
+}
+
+/*
+ * Load an existing storage pool, using the pool's builtin spa_config as a
+ * source of configuration information.
+ */
+static int
+spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
+    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
+    char **ereport)
+{
+       int error = 0;
+       nvlist_t *nvroot = NULL;
+       vdev_t *rvd;
+       uberblock_t *ub = &spa->spa_uberblock;
+       uint64_t config_cache_txg = spa->spa_config_txg;
+       int orig_mode = spa->spa_mode;
+       int parse;
+       uint64_t obj;
+
+       /*
+        * If this is an untrusted config, access the pool in read-only mode.
+        * This prevents things like resilvering recently removed devices.
+        */
+       if (!mosconfig)
+               spa->spa_mode = FREAD;
+
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+       spa->spa_load_state = state;
+
+       if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
+               return (EINVAL);
+
+       parse = (type == SPA_IMPORT_EXISTING ?
+           VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
+
+       /*
+        * Create "The Godfather" zio to hold all async IOs
+        */
+       spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
+           ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
+
+       /*
+        * Parse the configuration into a vdev tree.  We explicitly set the
+        * value that will be returned by spa_version() since parsing the
+        * configuration requires knowing the version number.
+        */
+       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+       error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
+       spa_config_exit(spa, SCL_ALL, FTAG);
+
+       if (error != 0)
+               return (error);
+
+       ASSERT(spa->spa_root_vdev == rvd);
+
+       if (type != SPA_IMPORT_ASSEMBLE) {
+               ASSERT(spa_guid(spa) == pool_guid);
+       }
+
+       /*
+        * Try to open all vdevs, loading each label in the process.
+        */
+       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+       error = vdev_open(rvd);
+       spa_config_exit(spa, SCL_ALL, FTAG);
+       if (error != 0)
+               return (error);
+
+       /*
+        * We need to validate the vdev labels against the configuration that
+        * we have in hand, which is dependent on the setting of mosconfig. If
+        * mosconfig is true then we're validating the vdev labels based on
+        * that config.  Otherwise, we're validating against the cached config
+        * (zpool.cache) that was read when we loaded the zfs module, and then
+        * later we will recursively call spa_load() and validate against
+        * the vdev config.
+        *
+        * If we're assembling a new pool that's been split off from an
+        * existing pool, the labels haven't yet been updated so we skip
+        * validation for now.
+        */
+       if (type != SPA_IMPORT_ASSEMBLE) {
+               spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+               error = vdev_validate(rvd);
+               spa_config_exit(spa, SCL_ALL, FTAG);
+
+               if (error != 0)
+                       return (error);
+
+               if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
+                       return (ENXIO);
+       }
+
+       /*
+        * Find the best uberblock.
+        */
+       vdev_uberblock_load(NULL, rvd, ub);
+
+       /*
+        * If we weren't able to find a single valid uberblock, return failure.
+        */
+       if (ub->ub_txg == 0)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
+
+       /*
+        * If the pool is newer than the code, we can't open it.
+        */
+       if (ub->ub_version > SPA_VERSION)
+               return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
+
+       /*
+        * If the vdev guid sum doesn't match the uberblock, we have an
+        * incomplete configuration.
+        */
+       if (mosconfig && type != SPA_IMPORT_ASSEMBLE &&
+           rvd->vdev_guid_sum != ub->ub_guid_sum)
+               return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
+
+       if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
+               spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+               spa_try_repair(spa, config);
+               spa_config_exit(spa, SCL_ALL, FTAG);
+               nvlist_free(spa->spa_config_splitting);
+               spa->spa_config_splitting = NULL;
+       }
+
+       /*
+        * Initialize internal SPA structures.
+        */
+       spa->spa_state = POOL_STATE_ACTIVE;
+       spa->spa_ubsync = spa->spa_uberblock;
+       spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
+           TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
+       spa->spa_first_txg = spa->spa_last_ubsync_txg ?
+           spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
+       spa->spa_claim_max_txg = spa->spa_first_txg;
+       spa->spa_prev_software_version = ub->ub_software_version;
+
+       error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
+       if (error)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+       spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
+
+       if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+       if (!mosconfig) {
+               uint64_t hostid;
+               nvlist_t *policy = NULL, *nvconfig;
+
+               if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
+                       return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+               if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
                    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
                        char *hostname;
                        unsigned long myhostid = 0;
 
-                       VERIFY(nvlist_lookup_string(newconfig,
+                       VERIFY(nvlist_lookup_string(nvconfig,
                            ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
 
 #ifdef _KERNEL
@@ -1334,185 +1820,133 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
 #endif /* _KERNEL */
                        if (hostid != 0 && myhostid != 0 &&
                            hostid != myhostid) {
+                               nvlist_free(nvconfig);
                                cmn_err(CE_WARN, "pool '%s' could not be "
                                    "loaded as it was last accessed by "
                                    "another system (host: %s hostid: 0x%lx). "
                                    "See: http://www.sun.com/msg/ZFS-8000-EY",
                                    spa_name(spa), hostname,
                                    (unsigned long)hostid);
-                               error = EBADF;
-                               goto out;
+                               return (EBADF);
                        }
                }
+               if (nvlist_lookup_nvlist(spa->spa_config,
+                   ZPOOL_REWIND_POLICY, &policy) == 0)
+                       VERIFY(nvlist_add_nvlist(nvconfig,
+                           ZPOOL_REWIND_POLICY, policy) == 0);
 
-               spa_config_set(spa, newconfig);
+               spa_config_set(spa, nvconfig);
                spa_unload(spa);
                spa_deactivate(spa);
                spa_activate(spa, orig_mode);
 
-               return (spa_load(spa, newconfig, state, B_TRUE));
+               return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
        }
 
-       if (zap_lookup(spa->spa_meta_objset,
-           DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
-           sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
-               vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                   VDEV_AUX_CORRUPT_DATA);
-               error = EIO;
-               goto out;
-       }
+       if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+       error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
+       if (error != 0)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
        /*
         * Load the bit that tells us to use the new accounting function
         * (raid-z deflation).  If we have an older pool, this will not
         * be present.
         */
-       error = zap_lookup(spa->spa_meta_objset,
-           DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
-           sizeof (uint64_t), 1, &spa->spa_deflate);
-       if (error != 0 && error != ENOENT) {
-               vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                   VDEV_AUX_CORRUPT_DATA);
-               error = EIO;
-               goto out;
-       }
+       error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
+       if (error != 0 && error != ENOENT)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+       error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
+           &spa->spa_creation_version);
+       if (error != 0 && error != ENOENT)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
        /*
         * Load the persistent error log.  If we have an older pool, this will
         * not be present.
         */
-       error = zap_lookup(spa->spa_meta_objset,
-           DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
-           sizeof (uint64_t), 1, &spa->spa_errlog_last);
-       if (error != 0 && error != ENOENT) {
-               vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                   VDEV_AUX_CORRUPT_DATA);
-               error = EIO;
-               goto out;
-       }
+       error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
+       if (error != 0 && error != ENOENT)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
-       error = zap_lookup(spa->spa_meta_objset,
-           DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
-           sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
-       if (error != 0 && error != ENOENT) {
-               vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                   VDEV_AUX_CORRUPT_DATA);
-               error = EIO;
-               goto out;
-       }
+       error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
+           &spa->spa_errlog_scrub);
+       if (error != 0 && error != ENOENT)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
        /*
         * Load the history object.  If we have an older pool, this
         * will not be present.
         */
-       error = zap_lookup(spa->spa_meta_objset,
-           DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
-           sizeof (uint64_t), 1, &spa->spa_history);
-       if (error != 0 && error != ENOENT) {
-               vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                   VDEV_AUX_CORRUPT_DATA);
-               error = EIO;
-               goto out;
-       }
+       error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
+       if (error != 0 && error != ENOENT)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+       /*
+        * If we're assembling the pool from the split-off vdevs of
+        * an existing pool, we don't want to attach the spares & cache
+        * devices.
+        */
 
        /*
         * Load any hot spares for this pool.
         */
-       error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-           DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object);
-       if (error != 0 && error != ENOENT) {
-               vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                   VDEV_AUX_CORRUPT_DATA);
-               error = EIO;
-               goto out;
-       }
-       if (error == 0) {
+       error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
+       if (error != 0 && error != ENOENT)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+       if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
                ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
                if (load_nvlist(spa, spa->spa_spares.sav_object,
-                   &spa->spa_spares.sav_config) != 0) {
-                       vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                           VDEV_AUX_CORRUPT_DATA);
-                       error = EIO;
-                       goto out;
-               }
+                   &spa->spa_spares.sav_config) != 0)
+                       return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
                spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
                spa_load_spares(spa);
                spa_config_exit(spa, SCL_ALL, FTAG);
+       } else if (error == 0) {
+               spa->spa_spares.sav_sync = B_TRUE;
        }
 
        /*
         * Load any level 2 ARC devices for this pool.
         */
-       error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-           DMU_POOL_L2CACHE, sizeof (uint64_t), 1,
+       error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
            &spa->spa_l2cache.sav_object);
-       if (error != 0 && error != ENOENT) {
-               vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                   VDEV_AUX_CORRUPT_DATA);
-               error = EIO;
-               goto out;
-       }
-       if (error == 0) {
+       if (error != 0 && error != ENOENT)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+       if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
                ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
                if (load_nvlist(spa, spa->spa_l2cache.sav_object,
-                   &spa->spa_l2cache.sav_config) != 0) {
-                       vdev_set_state(rvd, B_TRUE,
-                           VDEV_STATE_CANT_OPEN,
-                           VDEV_AUX_CORRUPT_DATA);
-                       error = EIO;
-                       goto out;
-               }
+                   &spa->spa_l2cache.sav_config) != 0)
+                       return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
                spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
                spa_load_l2cache(spa);
                spa_config_exit(spa, SCL_ALL, FTAG);
+       } else if (error == 0) {
+               spa->spa_l2cache.sav_sync = B_TRUE;
        }
 
-       spa_load_log_state(spa);
-
-       if (spa_check_logs(spa)) {
-               vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                   VDEV_AUX_BAD_LOG);
-               error = ENXIO;
-               ereport = FM_EREPORT_ZFS_LOG_REPLAY;
-               goto out;
-       }
-
-
        spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
 
-       error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-           DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);
-
-       if (error && error != ENOENT) {
-               vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                   VDEV_AUX_CORRUPT_DATA);
-               error = EIO;
-               goto out;
-       }
+       error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
+       if (error && error != ENOENT)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
        if (error == 0) {
-               (void) zap_lookup(spa->spa_meta_objset,
-                   spa->spa_pool_props_object,
-                   zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
-                   sizeof (uint64_t), 1, &spa->spa_bootfs);
-               (void) zap_lookup(spa->spa_meta_objset,
-                   spa->spa_pool_props_object,
-                   zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
-                   sizeof (uint64_t), 1, &autoreplace);
-               (void) zap_lookup(spa->spa_meta_objset,
-                   spa->spa_pool_props_object,
-                   zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
-                   sizeof (uint64_t), 1, &spa->spa_delegation);
-               (void) zap_lookup(spa->spa_meta_objset,
-                   spa->spa_pool_props_object,
-                   zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
-                   sizeof (uint64_t), 1, &spa->spa_failmode);
-               (void) zap_lookup(spa->spa_meta_objset,
-                   spa->spa_pool_props_object,
-                   zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND),
-                   sizeof (uint64_t), 1, &spa->spa_autoexpand);
+               uint64_t autoreplace;
+
+               spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
+               spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
+               spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
+               spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
+               spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
+               spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
+                   &spa->spa_dedup_ditto);
+
+               spa->spa_autoreplace = (autoreplace != 0);
        }
 
        /*
@@ -1522,8 +1956,18 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
         * unopenable vdevs so that the normal autoreplace handler can take
         * over.
         */
-       if (autoreplace && state != SPA_LOAD_TRYIMPORT)
+       if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
                spa_check_removed(spa->spa_root_vdev);
+               /*
+                * For the import case, this is done in spa_import(), because
+                * at this point we're using the spare definitions from
+                * the MOS config, not necessarily from the userland config.
+                */
+               if (state != SPA_LOAD_IMPORT) {
+                       spa_aux_check_removed(&spa->spa_spares);
+                       spa_aux_check_removed(&spa->spa_l2cache);
+               }
+       }
 
        /*
         * Load the vdev state for all toplevel vdevs.
@@ -1541,12 +1985,48 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
         * Check the state of the root vdev.  If it can't be opened, it
         * indicates one or more toplevel vdevs are faulted.
         */
-       if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
-               error = ENXIO;
-               goto out;
+       if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
+               return (ENXIO);
+
+       /*
+        * Load the DDTs (dedup tables).
+        */
+       error = ddt_load(spa);
+       if (error != 0)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+       spa_update_dspace(spa);
+
+       if (state != SPA_LOAD_TRYIMPORT) {
+               error = spa_load_verify(spa);
+               if (error)
+                       return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+                           error));
        }
 
-       if (spa_writeable(spa)) {
+       /*
+        * Load the intent log state and check log integrity.  If we're
+        * assembling a pool from a split, the log is not transferred over.
+        */
+       if (type != SPA_IMPORT_ASSEMBLE) {
+               nvlist_t *nvconfig;
+
+               if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
+                       return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+               VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE,
+                   &nvroot) == 0);
+               spa_load_log_state(spa, nvroot);
+               nvlist_free(nvconfig);
+
+               if (spa_check_logs(spa)) {
+                       *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
+                       return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
+               }
+       }
+
+       if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
+           spa->spa_load_max_txg == UINT64_MAX)) {
                dmu_tx_t *tx;
                int need_update = B_FALSE;
 
@@ -1555,60 +2035,154 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
                /*
                 * Claim log blocks that haven't been committed yet.
                 * This must all happen in a single txg.
+                * Note: spa_claim_max_txg is updated by spa_claim_notify(),
+                * invoked from zil_claim_log_block()'s i/o done callback.
+                * Price of rollback is that we abandon the log.
                 */
+               spa->spa_claiming = B_TRUE;
+
                tx = dmu_tx_create_assigned(spa_get_dsl(spa),
                    spa_first_txg(spa));
                (void) dmu_objset_find(spa_name(spa),
                    zil_claim, tx, DS_FIND_CHILDREN);
                dmu_tx_commit(tx);
 
-               spa->spa_log_state = SPA_LOG_GOOD;
-               spa->spa_sync_on = B_TRUE;
-               txg_sync_start(spa->spa_dsl_pool);
+               spa->spa_claiming = B_FALSE;
+
+               spa_set_log_state(spa, SPA_LOG_GOOD);
+               spa->spa_sync_on = B_TRUE;
+               txg_sync_start(spa->spa_dsl_pool);
+
+               /*
+                * Wait for all claims to sync.  We sync up to the highest
+                * claimed log block birth time so that claimed log blocks
+                * don't appear to be from the future.  spa_claim_max_txg
+                * will have been set for us by either zil_check_log_chain()
+                * (invoked from spa_check_logs()) or zil_claim() above.
+                */
+               txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
+
+               /*
+                * If the config cache is stale, or we have uninitialized
+                * metaslabs (see spa_vdev_add()), then update the config.
+                *
+                * If spa_load_verbatim is true, trust the current
+                * in-core spa_config and update the disk labels.
+                */
+               if (config_cache_txg != spa->spa_config_txg ||
+                   state == SPA_LOAD_IMPORT || spa->spa_load_verbatim ||
+                   state == SPA_LOAD_RECOVER)
+                       need_update = B_TRUE;
+
+               for (int c = 0; c < rvd->vdev_children; c++)
+                       if (rvd->vdev_child[c]->vdev_ms_array == 0)
+                               need_update = B_TRUE;
+
+               /*
+                * Update the config cache asynchronously in case we're the
+                * root pool, in which case the config cache isn't writable yet.
+                */
+               if (need_update)
+                       spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+
+               /*
+                * Check all DTLs to see if anything needs resilvering.
+                */
+               if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
+                   vdev_resilver_needed(rvd, NULL, NULL))
+                       spa_async_request(spa, SPA_ASYNC_RESILVER);
+
+               /*
+                * Delete any inconsistent datasets.
+                */
+               (void) dmu_objset_find(spa_name(spa),
+                   dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
+
+               /*
+                * Clean up any stale temporary dataset userrefs.
+                */
+               dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
+       }
+
+       return (0);
+}
+
+static int
+spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
+{
+       spa_unload(spa);
+       spa_deactivate(spa);
+
+       spa->spa_load_max_txg--;
+
+       spa_activate(spa, spa_mode_global);
+       spa_async_suspend(spa);
+
+       return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
+}
+
+static int
+spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
+    uint64_t max_request, int rewind_flags)
+{
+       nvlist_t *config = NULL;
+       int load_error, rewind_error;
+       uint64_t safe_rewind_txg;
+       uint64_t min_txg;
 
-               /*
-                * Wait for all claims to sync.
-                */
-               txg_wait_synced(spa->spa_dsl_pool, 0);
+       if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
+               spa->spa_load_max_txg = spa->spa_load_txg;
+               spa_set_log_state(spa, SPA_LOG_CLEAR);
+       } else {
+               spa->spa_load_max_txg = max_request;
+       }
 
-               /*
-                * If the config cache is stale, or we have uninitialized
-                * metaslabs (see spa_vdev_add()), then update the config.
-                *
-                * If spa_load_verbatim is true, trust the current
-                * in-core spa_config and update the disk labels.
-                */
-               if (config_cache_txg != spa->spa_config_txg ||
-                   state == SPA_LOAD_IMPORT || spa->spa_load_verbatim)
-                       need_update = B_TRUE;
+       load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
+           mosconfig);
+       if (load_error == 0)
+               return (0);
 
-               for (int c = 0; c < rvd->vdev_children; c++)
-                       if (rvd->vdev_child[c]->vdev_ms_array == 0)
-                               need_update = B_TRUE;
+       if (spa->spa_root_vdev != NULL)
+               config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 
-               /*
-                * Update the config cache asychronously in case we're the
-                * root pool, in which case the config cache isn't writable yet.
-                */
-               if (need_update)
-                       spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+       spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
+       spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
 
-               /*
-                * Check all DTLs to see if anything needs resilvering.
-                */
-               if (vdev_resilver_needed(rvd, NULL, NULL))
-                       spa_async_request(spa, SPA_ASYNC_RESILVER);
+       if (rewind_flags & ZPOOL_NEVER_REWIND) {
+               nvlist_free(config);
+               return (load_error);
        }
 
-       error = 0;
-out:
-       spa->spa_minref = refcount_count(&spa->spa_refcount);
-       if (error && error != EBADF)
-               zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
-       spa->spa_load_state = SPA_LOAD_NONE;
-       spa->spa_ena = 0;
+       /* Price of rolling back is discarding txgs, including log */
+       if (state == SPA_LOAD_RECOVER)
+               spa_set_log_state(spa, SPA_LOG_CLEAR);
 
-       return (error);
+       spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
+       safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
+       min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
+           TXG_INITIAL : safe_rewind_txg;
+
+       /*
+        * Continue as long as we're finding errors, we're still within
+        * the acceptable rewind range, and we're still finding uberblocks
+        */
+       while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
+           spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
+               if (spa->spa_load_max_txg < safe_rewind_txg)
+                       spa->spa_extreme_rewind = B_TRUE;
+               rewind_error = spa_load_retry(spa, state, mosconfig);
+       }
+
+       if (config)
+               spa_rewind_data_to_nvlist(spa, config);
+
+       spa->spa_extreme_rewind = B_FALSE;
+       spa->spa_load_max_txg = UINT64_MAX;
+
+       if (config && (rewind_error || state != SPA_LOAD_RECOVER))
+               spa_config_set(spa, config);
+
+       return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
 }
 
 /*
@@ -1624,7 +2198,8 @@ out:
  * ambiguous state.
  */
 static int
-spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
+spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
+    nvlist_t **config)
 {
        spa_t *spa;
        int error;
@@ -1648,11 +2223,23 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
                        mutex_exit(&spa_namespace_lock);
                return (ENOENT);
        }
+
        if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
+               spa_load_state_t state = SPA_LOAD_OPEN;
+               zpool_rewind_policy_t policy;
+
+               zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
+                   &policy);
+               if (policy.zrp_request & ZPOOL_DO_REWIND)
+                       state = SPA_LOAD_RECOVER;
 
                spa_activate(spa, spa_mode_global);
 
-               error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
+               if (state != SPA_LOAD_RECOVER)
+                       spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+
+               error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
+                   policy.zrp_request);
 
                if (error == EBADF) {
                        /*
@@ -1677,38 +2264,49 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
                         * information: the state of each vdev after the
                         * attempted vdev_open().  Return this to the user.
                         */
-                       if (config != NULL && spa->spa_root_vdev != NULL)
-                               *config = spa_config_generate(spa, NULL, -1ULL,
-                                   B_TRUE);
+                       if (config != NULL && spa->spa_config)
+                               VERIFY(nvlist_dup(spa->spa_config, config,
+                                   KM_SLEEP) == 0);
                        spa_unload(spa);
                        spa_deactivate(spa);
-                       spa->spa_last_open_failed = B_TRUE;
+                       spa->spa_last_open_failed = error;
                        if (locked)
                                mutex_exit(&spa_namespace_lock);
                        *spapp = NULL;
                        return (error);
-               } else {
-                       spa->spa_last_open_failed = B_FALSE;
                }
+
        }
 
        spa_open_ref(spa, tag);
 
-       if (locked)
-               mutex_exit(&spa_namespace_lock);
-
-       *spapp = spa;
 
        if (config != NULL)
                *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 
+       if (locked) {
+               spa->spa_last_open_failed = 0;
+               spa->spa_last_ubsync_txg = 0;
+               spa->spa_load_txg = 0;
+               mutex_exit(&spa_namespace_lock);
+       }
+
+       *spapp = spa;
+
        return (0);
 }
 
 int
+spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
+    nvlist_t **config)
+{
+       return (spa_open_common(name, spapp, tag, policy, config));
+}
+
+int
 spa_open(const char *name, spa_t **spapp, void *tag)
 {
-       return (spa_open_common(name, spapp, tag, NULL));
+       return (spa_open_common(name, spapp, tag, NULL, NULL));
 }
 
 /*
@@ -1779,7 +2377,7 @@ spa_add_spares(spa_t *spa, nvlist_t *config)
                        if (spa_spare_exists(guid, &pool, NULL) &&
                            pool != 0ULL) {
                                VERIFY(nvlist_lookup_uint64_array(
-                                   spares[i], ZPOOL_CONFIG_STATS,
+                                   spares[i], ZPOOL_CONFIG_VDEV_STATS,
                                    (uint64_t **)&vs, &vsc) == 0);
                                vs->vs_state = VDEV_STATE_CANT_OPEN;
                                vs->vs_aux = VDEV_AUX_SPARED;
@@ -1836,7 +2434,8 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config)
                        ASSERT(vd != NULL);
 
                        VERIFY(nvlist_lookup_uint64_array(l2cache[i],
-                           ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
+                           ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
+                           == 0);
                        vdev_get_stats(vd, vs);
                }
        }
@@ -1849,7 +2448,7 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
        spa_t *spa;
 
        *config = NULL;
-       error = spa_open_common(name, &spa, FTAG, config);
+       error = spa_open_common(name, &spa, FTAG, NULL, config);
 
        if (spa != NULL) {
                /*
@@ -2093,7 +2692,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
        uint64_t txg = TXG_INITIAL;
        nvlist_t **spares, **l2cache;
        uint_t nspares, nl2cache;
-       uint64_t version;
+       uint64_t version, obj;
 
        /*
         * If this pool already exists, return failure.
@@ -2109,11 +2708,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
         */
        (void) nvlist_lookup_string(props,
            zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
-       spa = spa_add(pool, altroot);
+       spa = spa_add(pool, NULL, altroot);
        spa_activate(spa, spa_mode_global);
 
-       spa->spa_uberblock.ub_txg = txg - 1;
-
        if (props && (error = spa_prop_validate(spa, props))) {
                spa_deactivate(spa);
                spa_remove(spa);
@@ -2125,6 +2722,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
            &version) != 0)
                version = SPA_VERSION;
        ASSERT(version <= SPA_VERSION);
+
+       spa->spa_first_txg = txg;
+       spa->spa_uberblock.ub_txg = txg - 1;
        spa->spa_uberblock.ub_version = version;
        spa->spa_ubsync = spa->spa_uberblock;
 
@@ -2200,6 +2800,13 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
        spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
        spa->spa_meta_objset = dp->dp_meta_objset;
 
+       /*
+        * Create DDTs (dedup tables).
+        */
+       ddt_create(spa);
+
+       spa_update_dspace(spa);
+
        tx = dmu_tx_create_assigned(dp, txg);
 
        /*
@@ -2215,6 +2822,12 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
                cmn_err(CE_PANIC, "failed to add pool config");
        }
 
+       if (zap_add(spa->spa_meta_objset,
+           DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
+           sizeof (uint64_t), 1, &version, tx) != 0) {
+               cmn_err(CE_PANIC, "failed to add pool version");
+       }
+
        /* Newly created pools with the right version are always deflated. */
        if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
                spa->spa_deflate = TRUE;
@@ -2226,20 +2839,20 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
        }
 
        /*
-        * Create the deferred-free bplist object.  Turn off compression
+        * Create the deferred-free bpobj.  Turn off compression
         * because sync-to-convergence takes longer if the blocksize
         * keeps changing.
         */
-       spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
-           1 << 14, tx);
-       dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
+       obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
+       dmu_object_set_compress(spa->spa_meta_objset, obj,
            ZIO_COMPRESS_OFF, tx);
-
        if (zap_add(spa->spa_meta_objset,
-           DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
-           sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
-               cmn_err(CE_PANIC, "failed to add bplist");
+           DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
+           sizeof (uint64_t), 1, &obj, tx) != 0) {
+               cmn_err(CE_PANIC, "failed to add bpobj");
        }
+       VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
+           spa->spa_meta_objset, obj));
 
        /*
         * Create the pool's history object.
@@ -2254,9 +2867,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
        spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
        spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
        spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
+
        if (props != NULL) {
                spa_configfile_set(spa, props, B_FALSE);
-               spa_sync_props(spa, props, CRED(), tx);
+               spa_sync_props(spa, props, tx);
        }
 
        dmu_tx_commit(tx);
@@ -2387,7 +3001,17 @@ spa_import_rootpool(char *devpath, char *devid)
        /*
         * Read the label from the boot device and generate a configuration.
         */
-       if ((config = spa_generate_rootconf(devpath, devid, &guid)) == NULL) {
+       config = spa_generate_rootconf(devpath, devid, &guid);
+#if defined(_OBP) && defined(_KERNEL)
+       if (config == NULL) {
+               if (strstr(devpath, "/iscsi/ssd") != NULL) {
+                       /* iscsi boot */
+                       get_iscsi_bootpath_phy(devpath);
+                       config = spa_generate_rootconf(devpath, devid, &guid);
+               }
+       }
+#endif
+       if (config == NULL) {
                cmn_err(CE_NOTE, "Can not read the pool label from '%s'",
                    devpath);
                return (EIO);
@@ -2406,7 +3030,7 @@ spa_import_rootpool(char *devpath, char *devid)
                spa_remove(spa);
        }
 
-       spa = spa_add(pname, NULL);
+       spa = spa_add(pname, config, NULL);
        spa->spa_is_root = B_TRUE;
        spa->spa_load_verbatim = B_TRUE;
 
@@ -2462,7 +3086,6 @@ spa_import_rootpool(char *devpath, char *devid)
                goto out;
        }
 
-       VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
        error = 0;
        spa_history_log_version(spa, LOG_POOL_IMPORT);
 out:
@@ -2495,12 +3118,10 @@ spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props)
 
        (void) nvlist_lookup_string(props,
            zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
-       spa = spa_add(pool, altroot);
+       spa = spa_add(pool, config, altroot);
 
        spa->spa_load_verbatim = B_TRUE;
 
-       VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
-
        if (props != NULL)
                spa_configfile_set(spa, props, B_FALSE);
 
@@ -2520,6 +3141,8 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
 {
        spa_t *spa;
        char *altroot = NULL;
+       spa_load_state_t state = SPA_LOAD_IMPORT;
+       zpool_rewind_policy_t policy;
        int error;
        nvlist_t *nvroot;
        nvlist_t **spares, **l2cache;
@@ -2529,17 +3152,21 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
         * If a pool with this name exists, return failure.
         */
        mutex_enter(&spa_namespace_lock);
-       if ((spa = spa_lookup(pool)) != NULL) {
+       if (spa_lookup(pool) != NULL) {
                mutex_exit(&spa_namespace_lock);
                return (EEXIST);
        }
 
+       zpool_get_rewind_policy(config, &policy);
+       if (policy.zrp_request & ZPOOL_DO_REWIND)
+               state = SPA_LOAD_RECOVER;
+
        /*
         * Create and initialize the spa structure.
         */
        (void) nvlist_lookup_string(props,
            zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
-       spa = spa_add(pool, altroot);
+       spa = spa_add(pool, config, altroot);
        spa_activate(spa, spa_mode_global);
 
        /*
@@ -2552,7 +3179,16 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
         * because the user-supplied config is actually the one to trust when
         * doing an import.
         */
-       error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
+       if (state != SPA_LOAD_RECOVER)
+               spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+       error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
+           policy.zrp_request);
+
+       /*
+        * Propagate anything learned about failing or best txgs
+        * back to caller
+        */
+       spa_rewind_data_to_nvlist(spa, config);
 
        spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
        /*
@@ -2592,8 +3228,6 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
                return (error);
        }
 
-       spa_async_resume(spa);
-
        /*
         * Override any spares and level 2 cache devices as specified by
         * the user, as these may have correct device names/devids, etc.
@@ -2629,6 +3263,14 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
                spa->spa_l2cache.sav_sync = B_TRUE;
        }
 
+       /*
+        * Check for any removed devices.
+        */
+       if (spa->spa_autoreplace) {
+               spa_aux_check_removed(&spa->spa_spares);
+               spa_aux_check_removed(&spa->spa_l2cache);
+       }
+
        if (spa_writeable(spa)) {
                /*
                 * Update the config cache to include the newly-imported pool.
@@ -2636,6 +3278,8 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
                spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
        }
 
+       spa_async_resume(spa);
+
        /*
         * It's possible that the pool was expanded while it was exported.
         * We kick off an async task to handle this for us.
@@ -2648,13 +3292,6 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
        return (0);
 }
 
-
-/*
- * This (illegal) pool name is used when temporarily importing a spa_t in order
- * to get the vdev stats associated with the imported devices.
- */
-#define        TRYIMPORT_NAME  "$import"
-
 nvlist_t *
 spa_tryimport(nvlist_t *tryconfig)
 {
@@ -2674,7 +3311,7 @@ spa_tryimport(nvlist_t *tryconfig)
         * Create and initialize the spa structure.
         */
        mutex_enter(&spa_namespace_lock);
-       spa = spa_add(TRYIMPORT_NAME, NULL);
+       spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
        spa_activate(spa, FREAD);
 
        /*
@@ -2682,7 +3319,7 @@ spa_tryimport(nvlist_t *tryconfig)
         * Pass TRUE for mosconfig because the user-supplied config
         * is actually the one to trust when doing an import.
         */
-       error = spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
+       error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
 
        /*
         * If 'tryconfig' was at least parsable, return the current config.
@@ -2827,7 +3464,8 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
                if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
                        spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
                        spa->spa_state = new_state;
-                       spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
+                       spa->spa_final_txg = spa_last_synced_txg(spa) +
+                           TXG_DEFER_SIZE + 1;
                        vdev_config_dirty(spa->spa_root_vdev);
                        spa_config_exit(spa, SCL_ALL, FTAG);
                }
@@ -2897,7 +3535,7 @@ spa_reset(char *pool)
 int
 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 {
-       uint64_t txg;
+       uint64_t txg, id;
        int error;
        vdev_t *rvd = spa->spa_root_vdev;
        vdev_t *vd, *tvd;
@@ -2938,9 +3576,19 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
         * Transfer each new top-level vdev from vd to rvd.
         */
        for (int c = 0; c < vd->vdev_children; c++) {
+
+               /*
+                * Set the vdev id to the first hole, if one exists.
+                */
+               for (id = 0; id < rvd->vdev_children; id++) {
+                       if (rvd->vdev_child[id]->vdev_ishole) {
+                               vdev_free(rvd->vdev_child[id]);
+                               break;
+                       }
+               }
                tvd = vd->vdev_child[c];
                vdev_remove_child(vd, tvd);
-               tvd->vdev_id = rvd->vdev_children;
+               tvd->vdev_id = id;
                vdev_add_child(rvd, tvd);
                vdev_config_dirty(tvd);
        }
@@ -2997,7 +3645,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 int
 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
 {
-       uint64_t txg, open_txg;
+       uint64_t txg, dtl_max_txg;
        vdev_t *rvd = spa->spa_root_vdev;
        vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
        vdev_ops_t *pvops;
@@ -3123,6 +3771,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
         */
        vdev_remove_child(newrootvd, newvd);
        newvd->vdev_id = pvd->vdev_children;
+       newvd->vdev_crtxg = oldvd->vdev_crtxg;
        vdev_add_child(pvd, newvd);
 
        tvd = newvd->vdev_top;
@@ -3132,13 +3781,14 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
        vdev_config_dirty(tvd);
 
        /*
-        * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
-        * upward when spa_vdev_exit() calls vdev_dtl_reassess().
+        * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
+        * for any dmu_sync-ed blocks.  It will propagate upward when
+        * spa_vdev_exit() calls vdev_dtl_reassess().
         */
-       open_txg = txg + TXG_CONCURRENT_STATES - 1;
+       dtl_max_txg = txg + TXG_CONCURRENT_STATES;
 
-       vdev_dtl_dirty(newvd, DTL_MISSING,
-           TXG_INITIAL, open_txg - TXG_INITIAL + 1);
+       vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
+           dtl_max_txg - TXG_INITIAL);
 
        if (newvd->vdev_isspare) {
                spa_spare_activate(newvd);
@@ -3154,10 +3804,18 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
         */
        vdev_dirty(tvd, VDD_DTL, newvd, txg);
 
-       (void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
+       /*
+        * Restart the resilver
+        */
+       dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
+
+       /*
+        * Commit the config
+        */
+       (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
 
-       spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, NULL,
-           CRED(),  "%s vdev=%s %s vdev=%s",
+       spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL,
+           "%s vdev=%s %s vdev=%s",
            replacing && newvd_isspare ? "spare in" :
            replacing ? "replace" : "attach", newvdpath,
            replacing ? "for" : "to", oldvdpath);
@@ -3165,11 +3823,6 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
        spa_strfree(oldvdpath);
        spa_strfree(newvdpath);
 
-       /*
-        * Kick off a resilver to update newvd.
-        */
-       VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0);
-
        return (0);
 }
 
@@ -3188,6 +3841,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
        boolean_t unspare = B_FALSE;
        uint64_t unspare_guid;
        size_t len;
+       char *vdpath;
 
        txg = spa_vdev_enter(spa);
 
@@ -3315,77 +3969,366 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
                (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
        }
 
-       /*
-        * If the parent mirror/replacing vdev only has one child,
-        * the parent is no longer needed.  Remove it from the tree.
-        */
-       if (pvd->vdev_children == 1)
-               vdev_remove_parent(cvd);
+       /*
+        * If the parent mirror/replacing vdev only has one child,
+        * the parent is no longer needed.  Remove it from the tree.
+        */
+       if (pvd->vdev_children == 1)
+               vdev_remove_parent(cvd);
+
+       /*
+        * We don't set tvd until now because the parent we just removed
+        * may have been the previous top-level vdev.
+        */
+       tvd = cvd->vdev_top;
+       ASSERT(tvd->vdev_parent == rvd);
+
+       /*
+        * Reevaluate the parent vdev state.
+        */
+       vdev_propagate_state(cvd);
+
+       /*
+        * If the 'autoexpand' property is set on the pool then automatically
+        * try to expand the size of the pool. For example if the device we
+        * just detached was smaller than the others, it may be possible to
+        * add metaslabs (i.e. grow the pool). We need to reopen the vdev
+        * first so that we can obtain the updated sizes of the leaf vdevs.
+        */
+       if (spa->spa_autoexpand) {
+               vdev_reopen(tvd);
+               vdev_expand(tvd, txg);
+       }
+
+       vdev_config_dirty(tvd);
+
+       /*
+        * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
+        * vd->vdev_detached is set and free vd's DTL object in syncing context.
+        * But first make sure we're not on any *other* txg's DTL list, to
+        * prevent vd from being accessed after it's freed.
+        */
+       vdpath = spa_strdup(vd->vdev_path);
+       for (int t = 0; t < TXG_SIZE; t++)
+               (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
+       vd->vdev_detached = B_TRUE;
+       vdev_dirty(tvd, VDD_DTL, vd, txg);
+
+       spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
+
+       error = spa_vdev_exit(spa, vd, txg, 0);
+
+       spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL,
+           "vdev=%s", vdpath);
+       spa_strfree(vdpath);
+
+       /*
+        * If this was the removal of the original device in a hot spare vdev,
+        * then we want to go through and remove the device from the hot spare
+        * list of every other pool.
+        */
+       if (unspare) {
+               spa_t *myspa = spa;
+               spa = NULL;
+               mutex_enter(&spa_namespace_lock);
+               while ((spa = spa_next(spa)) != NULL) {
+                       if (spa->spa_state != POOL_STATE_ACTIVE)
+                               continue;
+                       if (spa == myspa)
+                               continue;
+                       spa_open_ref(spa, FTAG);
+                       mutex_exit(&spa_namespace_lock);
+                       (void) spa_vdev_remove(spa, unspare_guid,
+                           B_TRUE);
+                       mutex_enter(&spa_namespace_lock);
+                       spa_close(spa, FTAG);
+               }
+               mutex_exit(&spa_namespace_lock);
+       }
+
+       return (error);
+}
+
+/*
+ * Split a set of devices from their mirrors, and create a new pool from them.
+ */
+int
+spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
+    nvlist_t *props, boolean_t exp)
+{
+       int error = 0;
+       uint64_t txg, *glist;
+       spa_t *newspa;
+       uint_t c, children, lastlog;
+       nvlist_t **child, *nvl, *tmp;
+       dmu_tx_t *tx;
+       char *altroot = NULL;
+       vdev_t *rvd, **vml = NULL;                      /* vdev modify list */
+       boolean_t activate_slog;
+
+       if (!spa_writeable(spa))
+               return (EROFS);
+
+       txg = spa_vdev_enter(spa);
+
+       /* clear the log and flush everything up to now */
+       activate_slog = spa_passivate_log(spa);
+       (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+       error = spa_offline_log(spa);
+       txg = spa_vdev_config_enter(spa);
+
+       if (activate_slog)
+               spa_activate_log(spa);
+
+       if (error != 0)
+               return (spa_vdev_exit(spa, NULL, txg, error));
+
+       /* check new spa name before going any further */
+       if (spa_lookup(newname) != NULL)
+               return (spa_vdev_exit(spa, NULL, txg, EEXIST));
+
+       /*
+        * scan through all the children to ensure they're all mirrors
+        */
+       if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
+           nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
+           &children) != 0)
+               return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+       /* first, check to ensure we've got the right child count */
+       rvd = spa->spa_root_vdev;
+       lastlog = 0;
+       for (c = 0; c < rvd->vdev_children; c++) {
+               vdev_t *vd = rvd->vdev_child[c];
+
+               /* don't count the holes & logs as children */
+               if (vd->vdev_islog || vd->vdev_ishole) {
+                       if (lastlog == 0)
+                               lastlog = c;
+                       continue;
+               }
+
+               lastlog = 0;
+       }
+       if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
+               return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+       /* next, ensure no spare or cache devices are part of the split */
+       if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
+           nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
+               return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+       vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
+       glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
+
+       /* then, loop over each vdev and validate it */
+       for (c = 0; c < children; c++) {
+               uint64_t is_hole = 0;
+
+               (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
+                   &is_hole);
+
+               if (is_hole != 0) {
+                       if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
+                           spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
+                               continue;
+                       } else {
+                               error = EINVAL;
+                               break;
+                       }
+               }
+
+               /* which disk is going to be split? */
+               if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
+                   &glist[c]) != 0) {
+                       error = EINVAL;
+                       break;
+               }
+
+               /* look it up in the spa */
+               vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
+               if (vml[c] == NULL) {
+                       error = ENODEV;
+                       break;
+               }
+
+               /* make sure there's nothing stopping the split */
+               if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
+                   vml[c]->vdev_islog ||
+                   vml[c]->vdev_ishole ||
+                   vml[c]->vdev_isspare ||
+                   vml[c]->vdev_isl2cache ||
+                   !vdev_writeable(vml[c]) ||
+                   vml[c]->vdev_children != 0 ||
+                   vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
+                   c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
+                       error = EINVAL;
+                       break;
+               }
+
+               if (vdev_dtl_required(vml[c])) {
+                       error = EBUSY;
+                       break;
+               }
+
+               /* we need certain info from the top level */
+               VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
+                   vml[c]->vdev_top->vdev_ms_array) == 0);
+               VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
+                   vml[c]->vdev_top->vdev_ms_shift) == 0);
+               VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
+                   vml[c]->vdev_top->vdev_asize) == 0);
+               VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
+                   vml[c]->vdev_top->vdev_ashift) == 0);
+       }
+
+       if (error != 0) {
+               kmem_free(vml, children * sizeof (vdev_t *));
+               kmem_free(glist, children * sizeof (uint64_t));
+               return (spa_vdev_exit(spa, NULL, txg, error));
+       }
+
+       /* stop writers from using the disks */
+       for (c = 0; c < children; c++) {
+               if (vml[c] != NULL)
+                       vml[c]->vdev_offline = B_TRUE;
+       }
+       vdev_reopen(spa->spa_root_vdev);
 
        /*
-        * We don't set tvd until now because the parent we just removed
-        * may have been the previous top-level vdev.
+        * Temporarily record the splitting vdevs in the spa config.  This
+        * will disappear once the config is regenerated.
         */
-       tvd = cvd->vdev_top;
-       ASSERT(tvd->vdev_parent == rvd);
+       VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+       VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
+           glist, children) == 0);
+       kmem_free(glist, children * sizeof (uint64_t));
 
-       /*
-        * Reevaluate the parent vdev state.
-        */
-       vdev_propagate_state(cvd);
+       mutex_enter(&spa->spa_props_lock);
+       VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
+           nvl) == 0);
+       mutex_exit(&spa->spa_props_lock);
+       spa->spa_config_splitting = nvl;
+       vdev_config_dirty(spa->spa_root_vdev);
 
-       /*
-        * If the 'autoexpand' property is set on the pool then automatically
-        * try to expand the size of the pool. For example if the device we
-        * just detached was smaller than the others, it may be possible to
-        * add metaslabs (i.e. grow the pool). We need to reopen the vdev
-        * first so that we can obtain the updated sizes of the leaf vdevs.
-        */
-       if (spa->spa_autoexpand) {
-               vdev_reopen(tvd);
-               vdev_expand(tvd, txg);
+       /* configure and create the new pool */
+       VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
+       VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+           exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
+       VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
+           spa_version(spa)) == 0);
+       VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+           spa->spa_config_txg) == 0);
+       VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+           spa_generate_guid(NULL)) == 0);
+       (void) nvlist_lookup_string(props,
+           zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
+
+       /* add the new pool to the namespace */
+       newspa = spa_add(newname, config, altroot);
+       newspa->spa_config_txg = spa->spa_config_txg;
+       spa_set_log_state(newspa, SPA_LOG_CLEAR);
+
+       /* release the spa config lock, retaining the namespace lock */
+       spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+
+       if (zio_injection_enabled)
+               zio_handle_panic_injection(spa, FTAG, 1);
+
+       spa_activate(newspa, spa_mode_global);
+       spa_async_suspend(newspa);
+
+       /* create the new pool from the disks of the original pool */
+       error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
+       if (error)
+               goto out;
+
+       /* if that worked, generate a real config for the new pool */
+       if (newspa->spa_root_vdev != NULL) {
+               VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
+                   NV_UNIQUE_NAME, KM_SLEEP) == 0);
+               VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
+                   ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
+               spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
+                   B_TRUE));
        }
 
-       vdev_config_dirty(tvd);
+       /* set the props */
+       if (props != NULL) {
+               spa_configfile_set(newspa, props, B_FALSE);
+               error = spa_prop_set(newspa, props);
+               if (error)
+                       goto out;
+       }
 
-       /*
-        * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
-        * vd->vdev_detached is set and free vd's DTL object in syncing context.
-        * But first make sure we're not on any *other* txg's DTL list, to
-        * prevent vd from being accessed after it's freed.
-        */
-       for (int t = 0; t < TXG_SIZE; t++)
-               (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
-       vd->vdev_detached = B_TRUE;
-       vdev_dirty(tvd, VDD_DTL, vd, txg);
+       /* flush everything */
+       txg = spa_vdev_config_enter(newspa);
+       vdev_config_dirty(newspa->spa_root_vdev);
+       (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
 
-       spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
+       if (zio_injection_enabled)
+               zio_handle_panic_injection(spa, FTAG, 2);
 
-       error = spa_vdev_exit(spa, vd, txg, 0);
+       spa_async_resume(newspa);
 
-       /*
-        * If this was the removal of the original device in a hot spare vdev,
-        * then we want to go through and remove the device from the hot spare
-        * list of every other pool.
-        */
-       if (unspare) {
-               spa_t *myspa = spa;
-               spa = NULL;
-               mutex_enter(&spa_namespace_lock);
-               while ((spa = spa_next(spa)) != NULL) {
-                       if (spa->spa_state != POOL_STATE_ACTIVE)
-                               continue;
-                       if (spa == myspa)
-                               continue;
-                       spa_open_ref(spa, FTAG);
-                       mutex_exit(&spa_namespace_lock);
-                       (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
-                       mutex_enter(&spa_namespace_lock);
-                       spa_close(spa, FTAG);
+       /* finally, update the original pool's config */
+       txg = spa_vdev_config_enter(spa);
+       tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+       error = dmu_tx_assign(tx, TXG_WAIT);
+       if (error != 0)
+               dmu_tx_abort(tx);
+       for (c = 0; c < children; c++) {
+               if (vml[c] != NULL) {
+                       vdev_split(vml[c]);
+                       if (error == 0)
+                               spa_history_log_internal(LOG_POOL_VDEV_DETACH,
+                                   spa, tx, "vdev=%s",
+                                   vml[c]->vdev_path);
+                       vdev_free(vml[c]);
                }
-               mutex_exit(&spa_namespace_lock);
        }
+       vdev_config_dirty(spa->spa_root_vdev);
+       spa->spa_config_splitting = NULL;
+       nvlist_free(nvl);
+       if (error == 0)
+               dmu_tx_commit(tx);
+       (void) spa_vdev_exit(spa, NULL, txg, 0);
+
+       if (zio_injection_enabled)
+               zio_handle_panic_injection(spa, FTAG, 3);
+
+       /* split is complete; log a history record */
+       spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL,
+           "split new pool %s from pool %s", newname, spa_name(spa));
+
+       kmem_free(vml, children * sizeof (vdev_t *));
+
+       /* if we're not going to mount the filesystems in userland, export */
+       if (exp)
+               error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
+                   B_FALSE, B_FALSE);
+
+       return (error);
+
+out:
+       spa_unload(newspa);
+       spa_deactivate(newspa);
+       spa_remove(newspa);
+
+       txg = spa_vdev_config_enter(spa);
+
+       /* re-online all offlined disks */
+       for (c = 0; c < children; c++) {
+               if (vml[c] != NULL)
+                       vml[c]->vdev_offline = B_FALSE;
+       }
+       vdev_reopen(spa->spa_root_vdev);
+
+       nvlist_free(spa->spa_config_splitting);
+       spa->spa_config_splitting = NULL;
+       (void) spa_vdev_exit(spa, NULL, txg, error);
 
+       kmem_free(vml, children * sizeof (vdev_t *));
        return (error);
 }
 
@@ -3431,16 +4374,113 @@ spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
 }
 
 /*
+ * Evacuate the device.
+ */
+static int
+spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
+{
+       uint64_t txg;
+       int error = 0;
+
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+       ASSERT(vd == vd->vdev_top);
+
+       /*
+        * Evacuate the device.  We don't hold the config lock as writer
+        * since we need to do I/O but we do keep the
+        * spa_namespace_lock held.  Once this completes the device
+        * should no longer have any blocks allocated on it.
+        */
+       if (vd->vdev_islog) {
+               if (vd->vdev_stat.vs_alloc != 0)
+                       error = spa_offline_log(spa);
+       } else {
+               error = ENOTSUP;
+       }
+
+       if (error)
+               return (error);
+
+       /*
+        * The evacuation succeeded.  Remove any remaining MOS metadata
+        * associated with this vdev, and wait for these changes to sync.
+        */
+       ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
+       txg = spa_vdev_config_enter(spa);
+       vd->vdev_removing = B_TRUE;
+       vdev_dirty(vd, 0, NULL, txg);
+       vdev_config_dirty(vd);
+       spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+
+       return (0);
+}
+
+/*
+ * Complete the removal by cleaning up the namespace.
+ */
+static void
+spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
+{
+       vdev_t *rvd = spa->spa_root_vdev;
+       uint64_t id = vd->vdev_id;
+       boolean_t last_vdev = (id == (rvd->vdev_children - 1));
+
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+       ASSERT(vd == vd->vdev_top);
+
+       /*
+        * Only remove any devices which are empty.
+        */
+       if (vd->vdev_stat.vs_alloc != 0)
+               return;
+
+       (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+
+       if (list_link_active(&vd->vdev_state_dirty_node))
+               vdev_state_clean(vd);
+       if (list_link_active(&vd->vdev_config_dirty_node))
+               vdev_config_clean(vd);
+
+       vdev_free(vd);
+
+       if (last_vdev) {
+               vdev_compact_children(rvd);
+       } else {
+               vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
+               vdev_add_child(rvd, vd);
+       }
+       vdev_config_dirty(rvd);
+
+       /*
+        * Reassess the health of our root vdev.
+        */
+       vdev_reopen(rvd);
+}
+
+/*
+ * Remove a device from the pool -
+ *
+ * Removing a device from the vdev namespace requires several steps
+ * and can take a significant amount of time.  As a result we use
+ * the spa_vdev_config_[enter/exit] functions which allow us to
+ * grab and release the spa_config_lock while still holding the namespace
+ * lock.  During each step the configuration is synced out.
+ */
+
+/*
  * Remove a device from the pool.  Currently, this supports removing only hot
- * spares and level 2 ARC devices.
+ * spares, slogs, and level 2 ARC devices.
  */
 int
 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
 {
        vdev_t *vd;
+       metaslab_group_t *mg;
        nvlist_t **spares, **l2cache, *nv;
-       uint_t nspares, nl2cache;
        uint64_t txg = 0;
+       uint_t nspares, nl2cache;
        int error = 0;
        boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
 
@@ -3476,6 +4516,49 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
                    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
                spa_load_l2cache(spa);
                spa->spa_l2cache.sav_sync = B_TRUE;
+       } else if (vd != NULL && vd->vdev_islog) {
+               ASSERT(!locked);
+               ASSERT(vd == vd->vdev_top);
+
+               /*
+                * XXX - Once we have bp-rewrite this should
+                * become the common case.
+                */
+
+               mg = vd->vdev_mg;
+
+               /*
+                * Stop allocating from this vdev.
+                */
+               metaslab_group_passivate(mg);
+
+               /*
+                * Wait for the youngest allocations and frees to sync,
+                * and then wait for the deferral of those frees to finish.
+                */
+               spa_vdev_config_exit(spa, NULL,
+                   txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
+
+               /*
+                * Attempt to evacuate the vdev.
+                */
+               error = spa_vdev_remove_evacuate(spa, vd);
+
+               txg = spa_vdev_config_enter(spa);
+
+               /*
+                * If we couldn't evacuate the vdev, unwind.
+                */
+               if (error) {
+                       metaslab_group_activate(mg);
+                       return (spa_vdev_exit(spa, NULL, txg, error));
+               }
+
+               /*
+                * Clean up the vdev namespace.
+                */
+               spa_vdev_remove_from_namespace(spa, vd);
+
        } else if (vd != NULL) {
                /*
                 * Normal vdevs cannot be removed (yet).
@@ -3517,6 +4600,7 @@ spa_vdev_resilver_done_hunt(vdev_t *vd)
                newvd = vd->vdev_child[1];
 
                if (vdev_dtl_empty(newvd, DTL_MISSING) &&
+                   vdev_dtl_empty(newvd, DTL_OUTAGE) &&
                    !vdev_dtl_required(oldvd))
                        return (oldvd);
        }
@@ -3530,6 +4614,7 @@ spa_vdev_resilver_done_hunt(vdev_t *vd)
 
                if (newvd->vdev_unspare &&
                    vdev_dtl_empty(newvd, DTL_MISSING) &&
+                   vdev_dtl_empty(newvd, DTL_OUTAGE) &&
                    !vdev_dtl_required(oldvd)) {
                        newvd->vdev_unspare = 0;
                        return (oldvd);
@@ -3576,36 +4661,41 @@ spa_vdev_resilver_done(spa_t *spa)
 }
 
 /*
- * Update the stored path or FRU for this vdev.  Dirty the vdev configuration,
- * relying on spa_vdev_enter/exit() to synchronize the labels and cache.
+ * Update the stored path or FRU for this vdev.
  */
 int
 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
     boolean_t ispath)
 {
        vdev_t *vd;
-       uint64_t txg;
+       boolean_t sync = B_FALSE;
 
-       txg = spa_vdev_enter(spa);
+       spa_vdev_state_enter(spa, SCL_ALL);
 
        if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
-               return (spa_vdev_exit(spa, NULL, txg, ENOENT));
+               return (spa_vdev_state_exit(spa, NULL, ENOENT));
 
        if (!vd->vdev_ops->vdev_op_leaf)
-               return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+               return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
        if (ispath) {
-               spa_strfree(vd->vdev_path);
-               vd->vdev_path = spa_strdup(value);
+               if (strcmp(value, vd->vdev_path) != 0) {
+                       spa_strfree(vd->vdev_path);
+                       vd->vdev_path = spa_strdup(value);
+                       sync = B_TRUE;
+               }
        } else {
-               if (vd->vdev_fru != NULL)
+               if (vd->vdev_fru == NULL) {
+                       vd->vdev_fru = spa_strdup(value);
+                       sync = B_TRUE;
+               } else if (strcmp(value, vd->vdev_fru) != 0) {
                        spa_strfree(vd->vdev_fru);
-               vd->vdev_fru = spa_strdup(value);
+                       vd->vdev_fru = spa_strdup(value);
+                       sync = B_TRUE;
+               }
        }
 
-       vdev_config_dirty(vd->vdev_top);
-
-       return (spa_vdev_exit(spa, NULL, txg, 0));
+       return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
 }
 
 int
@@ -3622,40 +4712,38 @@ spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
 
 /*
  * ==========================================================================
- * SPA Scrubbing
+ * SPA Scanning
  * ==========================================================================
  */
 
 int
-spa_scrub(spa_t *spa, pool_scrub_type_t type)
+spa_scan_stop(spa_t *spa)
+{
+       ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+       if (dsl_scan_resilvering(spa->spa_dsl_pool))
+               return (EBUSY);
+       return (dsl_scan_cancel(spa->spa_dsl_pool));
+}
+
+int
+spa_scan(spa_t *spa, pool_scan_func_t func)
 {
        ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
 
-       if ((uint_t)type >= POOL_SCRUB_TYPES)
+       if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
                return (ENOTSUP);
 
        /*
         * If a resilver was requested, but there is no DTL on a
         * writeable leaf device, we have nothing to do.
         */
-       if (type == POOL_SCRUB_RESILVER &&
+       if (func == POOL_SCAN_RESILVER &&
            !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
                spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
                return (0);
        }
 
-       if (type == POOL_SCRUB_EVERYTHING &&
-           spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE &&
-           spa->spa_dsl_pool->dp_scrub_isresilver)
-               return (EBUSY);
-
-       if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) {
-               return (dsl_pool_scrub_clean(spa->spa_dsl_pool));
-       } else if (type == POOL_SCRUB_NONE) {
-               return (dsl_pool_scrub_cancel(spa->spa_dsl_pool));
-       } else {
-               return (EINVAL);
-       }
+       return (dsl_scan(spa->spa_dsl_pool, func));
 }
 
 /*
@@ -3668,9 +4756,20 @@ static void
 spa_async_remove(spa_t *spa, vdev_t *vd)
 {
        if (vd->vdev_remove_wanted) {
-               vd->vdev_remove_wanted = 0;
+               vd->vdev_remove_wanted = B_FALSE;
+               vd->vdev_delayed_close = B_FALSE;
                vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
-               vdev_clear(spa, vd);
+
+               /*
+                * We want to clear the stats, but we don't want to do a full
+                * vdev_clear() as that will cause us to throw away
+                * degraded/faulted state as well as attempt to reopen the
+                * device, all of which is a waste.
+                */
+               vd->vdev_stat.vs_read_errors = 0;
+               vd->vdev_stat.vs_write_errors = 0;
+               vd->vdev_stat.vs_checksum_errors = 0;
+
                vdev_state_dirty(vd->vdev_top);
        }
 
@@ -3682,7 +4781,7 @@ static void
 spa_async_probe(spa_t *spa, vdev_t *vd)
 {
        if (vd->vdev_probe_wanted) {
-               vd->vdev_probe_wanted = 0;
+               vd->vdev_probe_wanted = B_FALSE;
                vdev_reopen(vd);        /* vdev_open() does the actual probe */
        }
 
@@ -3737,24 +4836,23 @@ spa_async_thread(spa_t *spa)
         * See if the config needs to be updated.
         */
        if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
-               uint64_t oldsz, space_update;
+               uint64_t old_space, new_space;
 
                mutex_enter(&spa_namespace_lock);
-               oldsz = spa_get_space(spa);
+               old_space = metaslab_class_get_space(spa_normal_class(spa));
                spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
-               space_update = spa_get_space(spa) - oldsz;
+               new_space = metaslab_class_get_space(spa_normal_class(spa));
                mutex_exit(&spa_namespace_lock);
 
                /*
                 * If the pool grew as a result of the config update,
                 * then log an internal history event.
                 */
-               if (space_update) {
-                       spa_history_internal_log(LOG_POOL_VDEV_ONLINE,
-                           spa, NULL, CRED(),
+               if (new_space != old_space) {
+                       spa_history_log_internal(LOG_POOL_VDEV_ONLINE,
+                           spa, NULL,
                            "pool '%s' size: %llu(+%llu)",
-                           spa_name(spa), spa_get_space(spa),
-                           space_update);
+                           spa_name(spa), new_space, new_space - old_space);
                }
        }
 
@@ -3762,7 +4860,7 @@ spa_async_thread(spa_t *spa)
         * See if any devices need to be marked REMOVED.
         */
        if (tasks & SPA_ASYNC_REMOVE) {
-               spa_vdev_state_enter(spa);
+               spa_vdev_state_enter(spa, SCL_NONE);
                spa_async_remove(spa, spa->spa_root_vdev);
                for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
                        spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
@@ -3781,7 +4879,7 @@ spa_async_thread(spa_t *spa)
         * See if any devices need to be probed.
         */
        if (tasks & SPA_ASYNC_PROBE) {
-               spa_vdev_state_enter(spa);
+               spa_vdev_state_enter(spa, SCL_NONE);
                spa_async_probe(spa, spa->spa_root_vdev);
                (void) spa_vdev_state_exit(spa, NULL, 0);
        }
@@ -3796,7 +4894,7 @@ spa_async_thread(spa_t *spa)
         * Kick off a resilver.
         */
        if (tasks & SPA_ASYNC_RESILVER)
-               VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0);
+               dsl_resilver_restart(spa->spa_dsl_pool, 0);
 
        /*
         * Let the world know that we're done.
@@ -3842,6 +4940,7 @@ spa_async_dispatch(spa_t *spa)
 void
 spa_async_request(spa_t *spa, int task)
 {
+       zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
        mutex_enter(&spa->spa_async_lock);
        spa->spa_async_tasks |= task;
        mutex_exit(&spa->spa_async_lock);
@@ -3853,37 +4952,22 @@ spa_async_request(spa_t *spa, int task)
  * ==========================================================================
  */
 
-static void
-spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
+static int
+bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
-       bplist_t *bpl = &spa->spa_sync_bplist;
-       dmu_tx_t *tx;
-       blkptr_t blk;
-       uint64_t itor = 0;
-       zio_t *zio;
-       int error;
-       uint8_t c = 1;
-
-       zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-
-       while (bplist_iterate(bpl, &itor, &blk) == 0) {
-               ASSERT(blk.blk_birth < txg);
-               zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL,
-                   ZIO_FLAG_MUSTSUCCEED));
-       }
-
-       error = zio_wait(zio);
-       ASSERT3U(error, ==, 0);
+       bpobj_t *bpo = arg;
+       bpobj_enqueue(bpo, bp, tx);
+       return (0);
+}
 
-       tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
-       bplist_vacate(bpl, tx);
+static int
+spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+       zio_t *zio = arg;
 
-       /*
-        * Pre-dirty the first block so we sync to convergence faster.
-        * (Usually only the first block is needed.)
-        */
-       dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
-       dmu_tx_commit(tx);
+       zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
+           zio->io_flags));
+       return (0);
 }
 
 static void
@@ -3950,7 +5034,7 @@ spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
                list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
                for (i = 0; i < sav->sav_count; i++)
                        list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
-                           B_FALSE, B_FALSE, B_TRUE);
+                           B_FALSE, VDEV_CONFIG_L2CACHE);
                VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
                    sav->sav_count) == 0);
                for (i = 0; i < sav->sav_count; i++)
@@ -3990,7 +5074,7 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
  * Set zpool properties.
  */
 static void
-spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        spa_t *spa = arg1;
        objset_t *mos = spa->spa_meta_objset;
@@ -4041,8 +5125,6 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
                         * Set pool property values in the poolprops mos object.
                         */
                        if (spa->spa_pool_props_object == 0) {
-                               objset_t *mos = spa->spa_meta_objset;
-
                                VERIFY((spa->spa_pool_props_object =
                                    zap_create(mos, DMU_OT_POOL_PROPS,
                                    DMU_OT_NONE, 0, tx)) > 0);
@@ -4091,7 +5173,12 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
                                break;
                        case ZPOOL_PROP_AUTOEXPAND:
                                spa->spa_autoexpand = intval;
-                               spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
+                               if (tx->tx_txg != TXG_INITIAL)
+                                       spa_async_request(spa,
+                                           SPA_ASYNC_AUTOEXPAND);
+                               break;
+                       case ZPOOL_PROP_DEDUPDITTO:
+                               spa->spa_dedup_ditto = intval;
                                break;
                        default:
                                break;
@@ -4101,8 +5188,8 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
                /* log internal history if this is not a zpool create */
                if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
                    tx->tx_txg != TXG_INITIAL) {
-                       spa_history_internal_log(LOG_POOL_PROPSET,
-                           spa, tx, cr, "%s %lld %s",
+                       spa_history_log_internal(LOG_POOL_PROPSET,
+                           spa, tx, "%s %lld %s",
                            nvpair_name(elem), intval, spa_name(spa));
                }
        }
@@ -4111,6 +5198,42 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 }
 
 /*
+ * Perform one-time upgrade on-disk changes.  spa_version() does not
+ * reflect the new version this txg, so there must be no changes this
+ * txg to anything that the upgrade code depends on after it executes.
+ * Therefore this must be called after dsl_pool_sync() does the sync
+ * tasks.
+ */
+static void
+spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
+{
+       dsl_pool_t *dp = spa->spa_dsl_pool;
+
+       ASSERT(spa->spa_sync_pass == 1);
+
+       if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
+           spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
+               dsl_pool_create_origin(dp, tx);
+
+               /* Keeping the origin open increases spa_minref */
+               spa->spa_minref += 3;
+       }
+
+       if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
+           spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
+               dsl_pool_upgrade_clones(dp, tx);
+       }
+
+       if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
+           spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
+               dsl_pool_upgrade_dir_clones(dp, tx);
+
+               /* Keeping the freedir open increases spa_minref */
+               spa->spa_minref += 3;
+       }
+}
+
+/*
  * Sync the specified transaction group.  New blocks may be dirtied as
  * part of the process, so we iterate until it converges.
  */
@@ -4119,11 +5242,11 @@ spa_sync(spa_t *spa, uint64_t txg)
 {
        dsl_pool_t *dp = spa->spa_dsl_pool;
        objset_t *mos = spa->spa_meta_objset;
-       bplist_t *bpl = &spa->spa_sync_bplist;
+       bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
+       bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
        vdev_t *rvd = spa->spa_root_vdev;
        vdev_t *vd;
        dmu_tx_t *tx;
-       int dirty_vdevs;
        int error;
 
        /*
@@ -4158,8 +5281,6 @@ spa_sync(spa_t *spa, uint64_t txg)
        }
        spa_config_exit(spa, SCL_STATE, FTAG);
 
-       VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
-
        tx = dmu_tx_create_assigned(dp, txg);
 
        /*
@@ -4183,34 +5304,29 @@ spa_sync(spa_t *spa, uint64_t txg)
                }
        }
 
-       if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
-           spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
-               dsl_pool_create_origin(dp, tx);
-
-               /* Keeping the origin open increases spa_minref */
-               spa->spa_minref += 3;
-       }
-
-       if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
-           spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
-               dsl_pool_upgrade_clones(dp, tx);
-       }
-
        /*
-        * If anything has changed in this txg, push the deferred frees
-        * from the previous txg.  If not, leave them alone so that we
-        * don't generate work on an otherwise idle system.
+        * If anything has changed in this txg, or if someone is waiting
+        * for this txg to sync (eg, spa_vdev_remove()), push the
+        * deferred frees from the previous txg.  If not, leave them
+        * alone so that we don't generate work on an otherwise idle
+        * system.
         */
        if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
            !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
-           !txg_list_empty(&dp->dp_sync_tasks, txg))
-               spa_sync_deferred_frees(spa, txg);
+           !txg_list_empty(&dp->dp_sync_tasks, txg) ||
+           ((dsl_scan_active(dp->dp_scan) ||
+           txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
+               zio_t *zio = zio_root(spa, NULL, NULL, 0);
+               VERIFY3U(bpobj_iterate(defer_bpo,
+                   spa_free_sync_cb, zio, tx), ==, 0);
+               VERIFY3U(zio_wait(zio), ==, 0);
+       }
 
        /*
         * Iterate to convergence.
         */
        do {
-               spa->spa_sync_pass++;
+               int pass = ++spa->spa_sync_pass;
 
                spa_sync_config_object(spa, tx);
                spa_sync_aux_dev(spa, &spa->spa_spares, tx,
@@ -4220,18 +5336,26 @@ spa_sync(spa_t *spa, uint64_t txg)
                spa_errlog_sync(spa, txg);
                dsl_pool_sync(dp, txg);
 
-               dirty_vdevs = 0;
-               while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
-                       vdev_sync(vd, txg);
-                       dirty_vdevs++;
+               if (pass <= SYNC_PASS_DEFERRED_FREE) {
+                       zio_t *zio = zio_root(spa, NULL, NULL, 0);
+                       bplist_iterate(free_bpl, spa_free_sync_cb,
+                           zio, tx);
+                       VERIFY(zio_wait(zio) == 0);
+               } else {
+                       bplist_iterate(free_bpl, bpobj_enqueue_cb,
+                           defer_bpo, tx);
                }
 
-               bplist_sync(bpl, tx);
-       } while (dirty_vdevs);
+               ddt_sync(spa, txg);
+               dsl_scan_sync(dp, tx);
 
-       bplist_close(bpl);
+               while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
+                       vdev_sync(vd, txg);
+
+               if (pass == 1)
+                       spa_sync_upgrades(spa, tx);
 
-       dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
+       } while (dmu_objset_is_dirty(mos, txg));
 
        /*
         * Rewrite the vdev configuration (which includes the uberblock)
@@ -4302,10 +5426,7 @@ spa_sync(spa_t *spa, uint64_t txg)
 
        spa->spa_ubsync = spa->spa_uberblock;
 
-       /*
-        * Clean up the ZIL records for the synced txg.
-        */
-       dsl_pool_zil_clean(dp);
+       dsl_pool_sync_done(dp, txg);
 
        /*
         * Update usable space statistics.
@@ -4313,6 +5434,8 @@ spa_sync(spa_t *spa, uint64_t txg)
        while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
                vdev_sync_done(vd, txg);
 
+       spa_update_dspace(spa);
+
        /*
         * It had better be the case that we didn't dirty anything
         * since vdev_config_sync().
@@ -4320,10 +5443,13 @@ spa_sync(spa_t *spa, uint64_t txg)
        ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
        ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
        ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
-       ASSERT(bpl->bpl_queue == NULL);
+
+       spa->spa_sync_pass = 0;
 
        spa_config_exit(spa, SCL_CONFIG, FTAG);
 
+       spa_handle_ignored_writes(spa);
+
        /*
         * If any async tasks have been requested, kick them off.
         */
index 49e9e50..a6cdb81 100644 (file)
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
+#include <sys/zio.h>
 #include <sys/spa.h>
 #include <sys/sunddi.h>
 
index b2063bb..cdeda3f 100644 (file)
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/spa.h>
@@ -75,7 +74,6 @@ spa_config_load(void)
        void *buf = NULL;
        nvlist_t *nvlist, *child;
        nvpair_t *nvpair;
-       spa_t *spa;
        char *pathname;
        struct _buf *file;
        uint64_t fsize;
@@ -119,7 +117,6 @@ spa_config_load(void)
        mutex_enter(&spa_namespace_lock);
        nvpair = NULL;
        while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
-
                if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
                        continue;
 
@@ -127,13 +124,7 @@ spa_config_load(void)
 
                if (spa_lookup(nvpair_name(nvpair)) != NULL)
                        continue;
-               spa = spa_add(nvpair_name(nvpair), NULL);
-
-               /*
-                * We blindly duplicate the configuration here.  If it's
-                * invalid, we will catch it when the pool is first opened.
-                */
-               VERIFY(nvlist_dup(child, &spa->spa_config, 0) == 0);
+               (void) spa_add(nvpair_name(nvpair), child, NULL);
        }
        mutex_exit(&spa_namespace_lock);
 
@@ -313,6 +304,24 @@ spa_config_set(spa_t *spa, nvlist_t *config)
        mutex_exit(&spa->spa_props_lock);
 }
 
+/* Add discovered rewind info, if any to the provided nvlist */
+void
+spa_rewind_data_to_nvlist(spa_t *spa, nvlist_t *tonvl)
+{
+       int64_t loss = 0;
+
+       if (tonvl == NULL || spa->spa_load_txg == 0)
+               return;
+
+       VERIFY(nvlist_add_uint64(tonvl, ZPOOL_CONFIG_LOAD_TIME,
+           spa->spa_load_txg_ts) == 0);
+       if (spa->spa_last_ubsync_txg)
+               loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
+       VERIFY(nvlist_add_int64(tonvl, ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
+       VERIFY(nvlist_add_uint64(tonvl, ZPOOL_CONFIG_LOAD_DATA_ERRORS,
+           spa->spa_load_data_errors) == 0);
+}
+
 /*
  * Generate the pool's configuration based on the current in-core state.
  * We infer whether to generate a complete config or just one top-level config
@@ -325,6 +334,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
        vdev_t *rvd = spa->spa_root_vdev;
        unsigned long hostid = 0;
        boolean_t locked = B_FALSE;
+       uint64_t split_guid;
 
        if (vd == NULL) {
                vd = rvd;
@@ -381,12 +391,66 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
                        VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG,
                            1ULL) == 0);
                vd = vd->vdev_top;              /* label contains top config */
+       } else {
+               /*
+                * Only add the (potentially large) split information
+                * in the mos config, and not in the vdev labels
+                */
+               if (spa->spa_config_splitting != NULL)
+                       VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT,
+                           spa->spa_config_splitting) == 0);
+       }
+
+       /*
+        * Add the top-level config.  We even add this on pools which
+        * don't support holes in the namespace as older pools will
+        * just ignore it.
+        */
+       vdev_top_config_generate(spa, config);
+
+       /*
+        * If we're splitting, record the original pool's guid.
+        */
+       if (spa->spa_config_splitting != NULL &&
+           nvlist_lookup_uint64(spa->spa_config_splitting,
+           ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) {
+               VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID,
+                   split_guid) == 0);
        }
 
-       nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE, B_FALSE);
+       nvroot = vdev_config_generate(spa, vd, getstats, 0);
        VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
        nvlist_free(nvroot);
 
+       if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) {
+               ddt_histogram_t *ddh;
+               ddt_stat_t *dds;
+               ddt_object_t *ddo;
+
+               ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
+               ddt_get_dedup_histogram(spa, ddh);
+               VERIFY(nvlist_add_uint64_array(config,
+                   ZPOOL_CONFIG_DDT_HISTOGRAM,
+                   (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)) == 0);
+               kmem_free(ddh, sizeof (ddt_histogram_t));
+
+               ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP);
+               ddt_get_dedup_object_stats(spa, ddo);
+               VERIFY(nvlist_add_uint64_array(config,
+                   ZPOOL_CONFIG_DDT_OBJ_STATS,
+                   (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)) == 0);
+               kmem_free(ddo, sizeof (ddt_object_t));
+
+               dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP);
+               ddt_get_dedup_stats(spa, dds);
+               VERIFY(nvlist_add_uint64_array(config,
+                   ZPOOL_CONFIG_DDT_STATS,
+                   (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)) == 0);
+               kmem_free(dds, sizeof (ddt_stat_t));
+       }
+
+       spa_rewind_data_to_nvlist(spa, config);
+
        if (locked)
                spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 
index ac0a20a..282140b 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
 #include <sys/zap.h>
 #include <sys/zio.h>
 
-/*
- * This is a stripped-down version of strtoull, suitable only for converting
- * lowercase hexidecimal numbers that don't overflow.
- */
-#ifdef _KERNEL
-uint64_t
-strtonum(const char *str, char **nptr)
-{
-       uint64_t val = 0;
-       char c;
-       int digit;
-
-       while ((c = *str) != '\0') {
-               if (c >= '0' && c <= '9')
-                       digit = c - '0';
-               else if (c >= 'a' && c <= 'f')
-                       digit = 10 + c - 'a';
-               else
-                       break;
-
-               val *= 16;
-               val += digit;
-
-               str++;
-       }
-
-       if (nptr)
-               *nptr = (char *)str;
-
-       return (val);
-}
-#endif
 
 /*
  * Convert a bookmark to a string.
@@ -134,7 +101,7 @@ spa_log_error(spa_t *spa, zio_t *zio)
         * If we are trying to import a pool, ignore any errors, as we won't be
         * writing to the pool any time soon.
         */
-       if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
+       if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
                return;
 
        mutex_enter(&spa->spa_errlist_lock);
index b77ac42..212abae 100644 (file)
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/spa.h>
@@ -33,6 +32,7 @@
 #include <sys/utsname.h>
 #include <sys/cmn_err.h>
 #include <sys/sunddi.h>
+#include "zfs_comutil.h"
 #ifdef _KERNEL
 #include <sys/zone.h>
 #endif
@@ -103,7 +103,8 @@ spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
         * Figure out maximum size of history log.  We set it at
         * 1% of pool size, with a max of 32MB and min of 128KB.
         */
-       shpp->sh_phys_max_off = spa_get_dspace(spa) / 100;
+       shpp->sh_phys_max_off =
+           metaslab_class_get_dspace(spa_normal_class(spa)) / 100;
        shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 32<<20);
        shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10);
 
@@ -186,8 +187,9 @@ spa_history_zone()
 /*
  * Write out a history event.
  */
+/*ARGSUSED*/
 static void
-spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        spa_t           *spa = arg1;
        history_arg_t   *hap = arg2;
@@ -230,9 +232,8 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
        VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_SLEEP) == 0);
        VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME,
            gethrestime_sec()) == 0);
-       VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_WHO,
-           (uint64_t)crgetuid(cr)) == 0);
-       if (hap->ha_zone[0] != '\0')
+       VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_WHO, hap->ha_uid) == 0);
+       if (hap->ha_zone != NULL)
                VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_ZONE,
                    hap->ha_zone) == 0);
 #ifdef _KERNEL
@@ -243,6 +244,8 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
            hap->ha_log_type == LOG_CMD_NORMAL) {
                VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD,
                    history_str) == 0);
+
+               zfs_dbgmsg("command: %s", history_str);
        } else {
                VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_INT_EVENT,
                    hap->ha_event) == 0);
@@ -250,6 +253,11 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
                    tx->tx_txg) == 0);
                VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_INT_STR,
                    history_str) == 0);
+
+               zfs_dbgmsg("internal %s pool:%s txg:%llu %s",
+                   zfs_history_event_names[hap->ha_event], spa_name(spa),
+                   (longlong_t)tx->tx_txg, history_str);
+
        }
 
        VERIFY(nvlist_size(nvrecord, &reclen, NV_ENCODE_XDR) == 0);
@@ -278,10 +286,10 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
        kmem_free(record_packed, reclen);
        dmu_buf_rele(dbp, FTAG);
 
-       if (hap->ha_log_type == LOG_INTERNAL) {
-               kmem_free((void*)hap->ha_history_str, HIS_MAX_RECORD_LEN);
-               kmem_free(hap, sizeof (history_arg_t));
-       }
+       strfree(hap->ha_history_str);
+       if (hap->ha_zone != NULL)
+               strfree(hap->ha_zone);
+       kmem_free(hap, sizeof (history_arg_t));
 }
 
 /*
@@ -290,15 +298,32 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 int
 spa_history_log(spa_t *spa, const char *history_str, history_log_type_t what)
 {
-       history_arg_t ha;
+       history_arg_t *ha;
+       int err = 0;
+       dmu_tx_t *tx;
 
        ASSERT(what != LOG_INTERNAL);
 
-       ha.ha_history_str = history_str;
-       ha.ha_log_type = what;
-       (void) strlcpy(ha.ha_zone, spa_history_zone(), sizeof (ha.ha_zone));
-       return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_history_log_sync,
-           spa, &ha, 0));
+       tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+       err = dmu_tx_assign(tx, TXG_WAIT);
+       if (err) {
+               dmu_tx_abort(tx);
+               return (err);
+       }
+
+       ha = kmem_alloc(sizeof (history_arg_t), KM_SLEEP);
+       ha->ha_history_str = strdup(history_str);
+       ha->ha_zone = strdup(spa_history_zone());
+       ha->ha_log_type = what;
+       ha->ha_uid = crgetuid(CRED());
+
+       /* Kick this off asynchronously; errors are ignored. */
+       dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL,
+           spa_history_log_sync, spa, ha, 0, tx);
+       dmu_tx_commit(tx);
+
+       /* spa_history_log_sync will free ha and strings */
+       return (err);
 }
 
 /*
@@ -321,6 +346,14 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
        if (!spa->spa_history)
                return (ENOENT);
 
+       /*
+        * The history is logged asynchronously, so when they request
+        * the first chunk of history, make sure everything has been
+        * synced to disk so that we get it.
+        */
+       if (*offp == 0 && spa_writeable(spa))
+               txg_wait_synced(spa_get_dsl(spa), 0);
+
        if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0)
                return (err);
        shpp = dbp->db_data;
@@ -392,10 +425,9 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
 
 static void
 log_internal(history_internal_events_t event, spa_t *spa,
-    dmu_tx_t *tx, cred_t *cr, const char *fmt, va_list adx)
+    dmu_tx_t *tx, const char *fmt, va_list adx)
 {
-       history_arg_t *hap;
-       char *str;
+       history_arg_t *ha;
 
        /*
         * If this is part of creating a pool, not everything is
@@ -404,28 +436,29 @@ log_internal(history_internal_events_t event, spa_t *spa,
        if (tx->tx_txg == TXG_INITIAL)
                return;
 
-       hap = kmem_alloc(sizeof (history_arg_t), KM_SLEEP);
-       str = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP);
+       ha = kmem_alloc(sizeof (history_arg_t), KM_SLEEP);
+       ha->ha_history_str = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1,
+           KM_SLEEP);
 
-       (void) vsnprintf(str, HIS_MAX_RECORD_LEN, fmt, adx);
+       (void) vsprintf(ha->ha_history_str, fmt, adx);
 
-       hap->ha_log_type = LOG_INTERNAL;
-       hap->ha_history_str = str;
-       hap->ha_event = event;
-       hap->ha_zone[0] = '\0';
+       ha->ha_log_type = LOG_INTERNAL;
+       ha->ha_event = event;
+       ha->ha_zone = NULL;
+       ha->ha_uid = 0;
 
        if (dmu_tx_is_syncing(tx)) {
-               spa_history_log_sync(spa, hap, cr, tx);
+               spa_history_log_sync(spa, ha, tx);
        } else {
                dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL,
-                   spa_history_log_sync, spa, hap, 0, tx);
+                   spa_history_log_sync, spa, ha, 0, tx);
        }
-       /* spa_history_log_sync() will free hap and str */
+       /* spa_history_log_sync() will free ha and strings */
 }
 
 void
-spa_history_internal_log(history_internal_events_t event, spa_t *spa,
-    dmu_tx_t *tx, cred_t *cr, const char *fmt, ...)
+spa_history_log_internal(history_internal_events_t event, spa_t *spa,
+    dmu_tx_t *tx, const char *fmt, ...)
 {
        dmu_tx_t *htx = tx;
        va_list adx;
@@ -440,7 +473,7 @@ spa_history_internal_log(history_internal_events_t event, spa_t *spa,
        }
 
        va_start(adx, fmt);
-       log_internal(event, spa, htx, cr, fmt, adx);
+       log_internal(event, spa, htx, fmt, adx);
        va_end(adx);
 
        /* if we didn't get a tx from the caller, commit the one we made */
@@ -455,7 +488,7 @@ spa_history_log_version(spa_t *spa, history_internal_events_t event)
        uint64_t current_vers = spa_version(spa);
 
        if (current_vers >= SPA_VERSION_ZPOOL_HISTORY) {
-               spa_history_internal_log(event, spa, NULL, CRED(),
+               spa_history_log_internal(event, spa, NULL,
                    "pool spa %llu; zfs spa %llu; zpl %d; uts %s %s %s %s",
                    (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION,
                    utsname.nodename, utsname.release, utsname.version,
index 8150ac9..52af7fc 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
+#include <sys/dsl_scan.h>
 #include <sys/fs/zfs.h>
 #include <sys/metaslab_impl.h>
-#include <sys/sunddi.h>
 #include <sys/arc.h>
+#include <sys/ddt.h>
 #include "zfs_prop.h"
 
 /*
  *
  * SCL_VDEV
  *     Held as reader to prevent changes to the vdev tree during trivial
- *     inquiries such as bp_get_dasize().  SCL_VDEV is distinct from the
+ *     inquiries such as bp_get_dsize().  SCL_VDEV is distinct from the
  *     other locks, and lower than all of them, to ensure that it's safe
  *     to acquire regardless of caller context.
  *
@@ -420,7 +420,7 @@ spa_lookup(const char *name)
  * exist by calling spa_lookup() first.
  */
 spa_t *
-spa_add(const char *name, const char *altroot)
+spa_add(const char *name, nvlist_t *config, const char *altroot)
 {
        spa_t *spa;
        spa_config_dirent_t *dp;
@@ -430,29 +430,36 @@ spa_add(const char *name, const char *altroot)
        spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
 
        mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
-       mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
-       mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
-       mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
+       mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
+       mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
+       mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
+       mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
+       mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
 
        cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
+       cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
 
+       for (int t = 0; t < TXG_SIZE; t++)
+               bplist_create(&spa->spa_free_bplist[t]);
+
        (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
        spa->spa_state = POOL_STATE_UNINITIALIZED;
        spa->spa_freeze_txg = UINT64_MAX;
        spa->spa_final_txg = UINT64_MAX;
+       spa->spa_load_max_txg = UINT64_MAX;
+       spa->spa_proc = &p0;
+       spa->spa_proc_state = SPA_PROC_NONE;
 
        refcount_create(&spa->spa_refcount);
        spa_config_lock_init(spa);
 
        avl_add(&spa_namespace_avl, spa);
 
-       mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
-
        /*
         * Set the alternate root, if there is one.
         */
@@ -468,9 +475,12 @@ spa_add(const char *name, const char *altroot)
            offsetof(spa_config_dirent_t, scd_link));
 
        dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
-       dp->scd_path = spa_strdup(spa_config_path);
+       dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
        list_insert_head(&spa->spa_config_list, dp);
 
+       if (config != NULL)
+               VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
+
        return (spa);
 }
 
@@ -487,6 +497,8 @@ spa_remove(spa_t *spa)
        ASSERT(MUTEX_HELD(&spa_namespace_lock));
        ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
 
+       nvlist_free(spa->spa_config_splitting);
+
        avl_remove(&spa_namespace_avl, spa);
        cv_broadcast(&spa_namespace_cv);
 
@@ -510,18 +522,23 @@ spa_remove(spa_t *spa)
 
        spa_config_lock_destroy(spa);
 
+       for (int t = 0; t < TXG_SIZE; t++)
+               bplist_destroy(&spa->spa_free_bplist[t]);
+
        cv_destroy(&spa->spa_async_cv);
+       cv_destroy(&spa->spa_proc_cv);
        cv_destroy(&spa->spa_scrub_io_cv);
        cv_destroy(&spa->spa_suspend_cv);
 
        mutex_destroy(&spa->spa_async_lock);
-       mutex_destroy(&spa->spa_scrub_lock);
-       mutex_destroy(&spa->spa_errlog_lock);
        mutex_destroy(&spa->spa_errlist_lock);
-       mutex_destroy(&spa->spa_sync_bplist.bpl_lock);
+       mutex_destroy(&spa->spa_errlog_lock);
        mutex_destroy(&spa->spa_history_lock);
+       mutex_destroy(&spa->spa_proc_lock);
        mutex_destroy(&spa->spa_props_lock);
+       mutex_destroy(&spa->spa_scrub_lock);
        mutex_destroy(&spa->spa_suspend_lock);
+       mutex_destroy(&spa->spa_vdev_top_lock);
 
        kmem_free(spa, sizeof (spa_t));
 }
@@ -815,12 +832,6 @@ spa_l2cache_activate(vdev_t *vd)
        mutex_exit(&spa_l2cache_lock);
 }
 
-void
-spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc)
-{
-       vdev_space_update(vd, space, alloc, B_FALSE);
-}
-
 /*
  * ==========================================================================
  * SPA vdev locking
@@ -835,7 +846,20 @@ spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc)
 uint64_t
 spa_vdev_enter(spa_t *spa)
 {
+       mutex_enter(&spa->spa_vdev_top_lock);
        mutex_enter(&spa_namespace_lock);
+       return (spa_vdev_config_enter(spa));
+}
+
+/*
+ * Internal implementation for spa_vdev_enter().  Used when a vdev
+ * operation requires multiple syncs (i.e. removing a device) while
+ * keeping the spa_namespace_lock held.
+ */
+uint64_t
+spa_vdev_config_enter(spa_t *spa)
+{
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
        spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
 
@@ -843,14 +867,14 @@ spa_vdev_enter(spa_t *spa)
 }
 
 /*
- * Unlock the spa_t after adding or removing a vdev.  Besides undoing the
- * locking of spa_vdev_enter(), we also want make sure the transactions have
- * synced to disk, and then update the global configuration cache with the new
- * information.
+ * Used in combination with spa_vdev_config_enter() to allow the syncing
+ * of multiple transactions without releasing the spa_namespace_lock.
  */
-int
-spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
+void
+spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
 {
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
        int config_changed = B_FALSE;
 
        ASSERT(txg > spa_last_synced_txg(spa));
@@ -863,16 +887,31 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
        vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
 
        /*
-        * If the config changed, notify the scrub thread that it must restart.
+        * If the config changed, notify the scrub that it must restart.
+        * This will initiate a resilver if needed.
         */
        if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
-               dsl_pool_scrub_restart(spa->spa_dsl_pool);
                config_changed = B_TRUE;
+               spa->spa_config_generation++;
        }
 
+       /*
+        * Verify the metaslab classes.
+        */
+       ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
+       ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
+
        spa_config_exit(spa, SCL_ALL, spa);
 
        /*
+        * Panic the system if the specified tag requires it.  This
+        * is useful for ensuring that configurations are updated
+        * transactionally.
+        */
+       if (zio_injection_enabled)
+               zio_handle_panic_injection(spa, tag, 0);
+
+       /*
         * Note: this txg_wait_synced() is important because it ensures
         * that there won't be more than one config change per txg.
         * This allows us to use the txg as the generation number.
@@ -892,8 +931,20 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
         */
        if (config_changed)
                spa_config_sync(spa, B_FALSE, B_TRUE);
+}
 
+/*
+ * Unlock the spa_t after adding or removing a vdev.  Besides undoing the
+ * locking of spa_vdev_enter(), we also want to make sure the transactions have
+ * synced to disk, and then update the global configuration cache with the new
+ * information.
+ */
+int
+spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
+{
+       spa_vdev_config_exit(spa, vd, txg, error, FTAG);
        mutex_exit(&spa_namespace_lock);
+       mutex_exit(&spa->spa_vdev_top_lock);
 
        return (error);
 }
@@ -902,18 +953,52 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
  * Lock the given spa_t for the purpose of changing vdev state.
  */
 void
-spa_vdev_state_enter(spa_t *spa)
+spa_vdev_state_enter(spa_t *spa, int oplocks)
 {
-       spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER);
+       int locks = SCL_STATE_ALL | oplocks;
+
+       /*
+        * Root pools may need to read from the underlying devfs filesystem
+        * when opening up a vdev.  Unfortunately if we're holding the
+        * SCL_ZIO lock it will result in a deadlock when we try to issue
+        * the read from the root filesystem.  Instead we "prefetch"
+        * the associated vnodes that we need prior to opening the
+        * underlying devices and cache them so that we can prevent
+        * any I/O when we are doing the actual open.
+        */
+       if (spa_is_root(spa)) {
+               int low = locks & ~(SCL_ZIO - 1);
+               int high = locks & ~low;
+
+               spa_config_enter(spa, high, spa, RW_WRITER);
+               vdev_hold(spa->spa_root_vdev);
+               spa_config_enter(spa, low, spa, RW_WRITER);
+       } else {
+               spa_config_enter(spa, locks, spa, RW_WRITER);
+       }
+       spa->spa_vdev_locks = locks;
 }
 
 int
 spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
 {
-       if (vd != NULL)
+       boolean_t config_changed = B_FALSE;
+
+       if (vd != NULL || error == 0)
+               vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev,
+                   0, 0, B_FALSE);
+
+       if (vd != NULL) {
                vdev_state_dirty(vd->vdev_top);
+               config_changed = B_TRUE;
+               spa->spa_config_generation++;
+       }
 
-       spa_config_exit(spa, SCL_STATE_ALL, spa);
+       if (spa_is_root(spa))
+               vdev_rele(spa->spa_root_vdev);
+
+       ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
+       spa_config_exit(spa, spa->spa_vdev_locks, spa);
 
        /*
         * If anything changed, wait for it to sync.  This ensures that,
@@ -924,6 +1009,15 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
        if (vd != NULL)
                txg_wait_synced(spa->spa_dsl_pool, 0);
 
+       /*
+        * If the config changed, update the config cache.
+        */
+       if (config_changed) {
+               mutex_enter(&spa_namespace_lock);
+               spa_config_sync(spa, B_FALSE, B_TRUE);
+               mutex_exit(&spa_namespace_lock);
+       }
+
        return (error);
 }
 
@@ -983,7 +1077,6 @@ spa_rename(const char *name, const char *newname)
        return (0);
 }
 
-
 /*
  * Determine whether a pool with given pool_guid exists.  If device_guid is
  * non-zero, determine whether the pool exists *and* contains a device with the
@@ -1056,48 +1149,36 @@ spa_get_random(uint64_t range)
        return (r % range);
 }
 
-void
-sprintf_blkptr(char *buf, int len, const blkptr_t *bp)
+uint64_t
+spa_generate_guid(spa_t *spa)
 {
-       int d;
+       uint64_t guid = spa_get_random(-1ULL);
 
-       if (bp == NULL) {
-               (void) snprintf(buf, len, "<NULL>");
-               return;
+       if (spa != NULL) {
+               while (guid == 0 || spa_guid_exists(spa_guid(spa), guid))
+                       guid = spa_get_random(-1ULL);
+       } else {
+               while (guid == 0 || spa_guid_exists(guid, 0))
+                       guid = spa_get_random(-1ULL);
        }
 
-       if (BP_IS_HOLE(bp)) {
-               (void) snprintf(buf, len, "<hole>");
-               return;
-       }
+       return (guid);
+}
+
+void
+sprintf_blkptr(char *buf, const blkptr_t *bp)
+{
+       char *type = NULL;
+       char *checksum = NULL;
+       char *compress = NULL;
 
-       (void) snprintf(buf, len, "[L%llu %s] %llxL/%llxP ",
-           (u_longlong_t)BP_GET_LEVEL(bp),
-           dmu_ot[BP_GET_TYPE(bp)].ot_name,
-           (u_longlong_t)BP_GET_LSIZE(bp),
-           (u_longlong_t)BP_GET_PSIZE(bp));
-
-       for (d = 0; d < BP_GET_NDVAS(bp); d++) {
-               const dva_t *dva = &bp->blk_dva[d];
-               (void) snprintf(buf + strlen(buf), len - strlen(buf),
-                   "DVA[%d]=<%llu:%llx:%llx> ", d,
-                   (u_longlong_t)DVA_GET_VDEV(dva),
-                   (u_longlong_t)DVA_GET_OFFSET(dva),
-                   (u_longlong_t)DVA_GET_ASIZE(dva));
+       if (bp != NULL) {
+               type = dmu_ot[BP_GET_TYPE(bp)].ot_name;
+               checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
+               compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
        }
 
-       (void) snprintf(buf + strlen(buf), len - strlen(buf),
-           "%s %s %s %s birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx",
-           zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
-           zio_compress_table[BP_GET_COMPRESS(bp)].ci_name,
-           BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",
-           BP_IS_GANG(bp) ? "gang" : "contiguous",
-           (u_longlong_t)bp->blk_birth,
-           (u_longlong_t)bp->blk_fill,
-           (u_longlong_t)bp->blk_cksum.zc_word[0],
-           (u_longlong_t)bp->blk_cksum.zc_word[1],
-           (u_longlong_t)bp->blk_cksum.zc_word[2],
-           (u_longlong_t)bp->blk_cksum.zc_word[3]);
+       SPRINTF_BLKPTR(snprintf, ' ', buf, bp, type, checksum, compress);
 }
 
 void
@@ -1126,6 +1207,37 @@ zfs_panic_recover(const char *fmt, ...)
 }
 
 /*
+ * This is a stripped-down version of strtoull, suitable only for converting
+ * lowercase hexadecimal numbers that don't overflow.
+ */
+uint64_t
+strtonum(const char *str, char **nptr)
+{
+       uint64_t val = 0;
+       char c;
+       int digit;
+
+       while ((c = *str) != '\0') {
+               if (c >= '0' && c <= '9')
+                       digit = c - '0';
+               else if (c >= 'a' && c <= 'f')
+                       digit = 10 + c - 'a';
+               else
+                       break;
+
+               val *= 16;
+               val += digit;
+
+               str++;
+       }
+
+       if (nptr)
+               *nptr = (char *)str;
+
+       return (val);
+}
+
+/*
  * ==========================================================================
  * Accessor functions
  * ==========================================================================
@@ -1203,59 +1315,55 @@ spa_first_txg(spa_t *spa)
        return (spa->spa_first_txg);
 }
 
+uint64_t
+spa_syncing_txg(spa_t *spa)
+{
+       return (spa->spa_syncing_txg);
+}
+
 pool_state_t
 spa_state(spa_t *spa)
 {
        return (spa->spa_state);
 }
 
-uint64_t
-spa_freeze_txg(spa_t *spa)
+spa_load_state_t
+spa_load_state(spa_t *spa)
 {
-       return (spa->spa_freeze_txg);
+       return (spa->spa_load_state);
 }
 
-/*
- * Return how much space is allocated in the pool (ie. sum of all asize)
- */
 uint64_t
-spa_get_alloc(spa_t *spa)
+spa_freeze_txg(spa_t *spa)
 {
-       return (spa->spa_root_vdev->vdev_stat.vs_alloc);
+       return (spa->spa_freeze_txg);
 }
 
-/*
- * Return how much (raid-z inflated) space there is in the pool.
- */
+/* ARGSUSED */
 uint64_t
-spa_get_space(spa_t *spa)
+spa_get_asize(spa_t *spa, uint64_t lsize)
 {
-       return (spa->spa_root_vdev->vdev_stat.vs_space);
+       /*
+        * The worst case is single-sector max-parity RAID-Z blocks, in which
+        * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
+        * times the size; so just assume that.  Add to this the fact that
+        * we can have up to 3 DVAs per bp, and one more factor of 2 because
+        * the block may be dittoed with up to 3 DVAs by ddt_sync().
+        */
+       return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2);
 }
 
-/*
- * Return the amount of raid-z-deflated space in the pool.
- */
 uint64_t
 spa_get_dspace(spa_t *spa)
 {
-       if (spa->spa_deflate)
-               return (spa->spa_root_vdev->vdev_stat.vs_dspace);
-       else
-               return (spa->spa_root_vdev->vdev_stat.vs_space);
+       return (spa->spa_dspace);
 }
 
-/* ARGSUSED */
-uint64_t
-spa_get_asize(spa_t *spa, uint64_t lsize)
+void
+spa_update_dspace(spa_t *spa)
 {
-       /*
-        * For now, the worst case is 512-byte RAID-Z blocks, in which
-        * case the space requirement is exactly 2x; so just assume that.
-        * Add to this the fact that we can have up to 3 DVAs per bp, and
-        * we have to multiply by a total of 6x.
-        */
-       return (lsize * 6);
+       spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
+           ddt_get_dedup_dspace(spa);
 }
 
 /*
@@ -1280,6 +1388,24 @@ spa_version(spa_t *spa)
        return (spa->spa_ubsync.ub_version);
 }
 
+boolean_t
+spa_deflate(spa_t *spa)
+{
+       return (spa->spa_deflate);
+}
+
+metaslab_class_t *
+spa_normal_class(spa_t *spa)
+{
+       return (spa->spa_normal_class);
+}
+
+metaslab_class_t *
+spa_log_class(spa_t *spa)
+{
+       return (spa->spa_log_class);
+}
+
 int
 spa_max_replication(spa_t *spa)
 {
@@ -1293,24 +1419,52 @@ spa_max_replication(spa_t *spa)
        return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
 }
 
+int
+spa_prev_software_version(spa_t *spa)
+{
+       return (spa->spa_prev_software_version);
+}
+
 uint64_t
-bp_get_dasize(spa_t *spa, const blkptr_t *bp)
+dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
 {
-       int sz = 0, i;
+       uint64_t asize = DVA_GET_ASIZE(dva);
+       uint64_t dsize = asize;
 
-       if (!spa->spa_deflate)
-               return (BP_GET_ASIZE(bp));
+       ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
-       spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
-       for (i = 0; i < SPA_DVAS_PER_BP; i++) {
-               vdev_t *vd =
-                   vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i]));
-               if (vd)
-                       sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >>
-                           SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
+       if (asize != 0 && spa->spa_deflate) {
+               vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+               dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
        }
+
+       return (dsize);
+}
+
+uint64_t
+bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
+{
+       uint64_t dsize = 0;
+
+       for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+               dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
+
+       return (dsize);
+}
+
+uint64_t
+bp_get_dsize(spa_t *spa, const blkptr_t *bp)
+{
+       uint64_t dsize = 0;
+
+       spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+       for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+               dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
+
        spa_config_exit(spa, SCL_VDEV, FTAG);
-       return (sz);
+
+       return (dsize);
 }
 
 /*
@@ -1412,9 +1566,18 @@ spa_has_slogs(spa_t *spa)
        return (spa->spa_log_class->mc_rotor != NULL);
 }
 
-/*
- * Return whether this pool is the root pool.
- */
+spa_log_state_t
+spa_get_log_state(spa_t *spa)
+{
+       return (spa->spa_log_state);
+}
+
+void
+spa_set_log_state(spa_t *spa, spa_log_state_t state)
+{
+       spa->spa_log_state = state;
+}
+
 boolean_t
 spa_is_root(spa_t *spa)
 {
@@ -1432,3 +1595,69 @@ spa_mode(spa_t *spa)
 {
        return (spa->spa_mode);
 }
+
+uint64_t
+spa_bootfs(spa_t *spa)
+{
+       return (spa->spa_bootfs);
+}
+
+uint64_t
+spa_delegation(spa_t *spa)
+{
+       return (spa->spa_delegation);
+}
+
+objset_t *
+spa_meta_objset(spa_t *spa)
+{
+       return (spa->spa_meta_objset);
+}
+
+enum zio_checksum
+spa_dedup_checksum(spa_t *spa)
+{
+       return (spa->spa_dedup_checksum);
+}
+
+/*
+ * Reset pool scan stat per scan pass (or reboot).
+ */
+void
+spa_scan_stat_init(spa_t *spa)
+{
+       /* data not stored on disk */
+       spa->spa_scan_pass_start = gethrestime_sec();
+       spa->spa_scan_pass_exam = 0;
+       vdev_scan_stat_init(spa->spa_root_vdev);
+}
+
+/*
+ * Get scan stats for zpool status reports
+ */
+int
+spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
+{
+       dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
+
+       if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
+               return (ENOENT);
+       bzero(ps, sizeof (pool_scan_stat_t));
+
+       /* data stored on disk */
+       ps->pss_func = scn->scn_phys.scn_func;
+       ps->pss_start_time = scn->scn_phys.scn_start_time;
+       ps->pss_end_time = scn->scn_phys.scn_end_time;
+       ps->pss_to_examine = scn->scn_phys.scn_to_examine;
+       ps->pss_examined = scn->scn_phys.scn_examined;
+       ps->pss_to_process = scn->scn_phys.scn_to_process;
+       ps->pss_processed = scn->scn_phys.scn_processed;
+       ps->pss_errors = scn->scn_phys.scn_errors;
+       ps->pss_state = scn->scn_phys.scn_state;
+
+       /* data not stored on disk */
+       ps->pss_pass_start = spa->spa_scan_pass_start;
+       ps->pss_pass_exam = spa->spa_scan_pass_exam;
+
+       return (0);
+}
index 75b55d5..1ce7b2a 100644 (file)
@@ -258,8 +258,10 @@ space_map_load_wait(space_map_t *sm)
 {
        ASSERT(MUTEX_HELD(sm->sm_lock));
 
-       while (sm->sm_loading)
+       while (sm->sm_loading) {
+               ASSERT(!sm->sm_loaded);
                cv_wait(&sm->sm_load_cv, sm->sm_lock);
+       }
 }
 
 /*
@@ -276,11 +278,8 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
        int error = 0;
 
        ASSERT(MUTEX_HELD(sm->sm_lock));
-
-       space_map_load_wait(sm);
-
-       if (sm->sm_loaded)
-               return (0);
+       ASSERT(!sm->sm_loaded);
+       ASSERT(!sm->sm_loading);
 
        sm->sm_loading = B_TRUE;
        end = smo->smo_objsize;
@@ -368,10 +367,8 @@ space_map_unload(space_map_t *sm)
 uint64_t
 space_map_maxsize(space_map_t *sm)
 {
-       if (sm->sm_loaded && sm->sm_ops != NULL)
-               return (sm->sm_ops->smop_max(sm));
-       else
-               return (-1ULL);
+       ASSERT(sm->sm_ops != NULL);
+       return (sm->sm_ops->smop_max(sm));
 }
 
 uint64_t
index e3c0e2a..f478ad0 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/txg_impl.h>
 #include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
 #include <sys/dsl_pool.h>
+#include <sys/dsl_scan.h>
 #include <sys/callb.h>
 
 /*
@@ -57,10 +58,12 @@ txg_init(dsl_pool_t *dp, uint64_t txg)
                for (i = 0; i < TXG_SIZE; i++) {
                        cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
                            NULL);
+                       list_create(&tx->tx_cpu[c].tc_callbacks[i],
+                           sizeof (dmu_tx_callback_t),
+                           offsetof(dmu_tx_callback_t, dcb_node));
                }
        }
 
-       rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL);
        mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);
 
        cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
@@ -83,7 +86,6 @@ txg_fini(dsl_pool_t *dp)
 
        ASSERT(tx->tx_threads == 0);
 
-       rw_destroy(&tx->tx_suspend);
        mutex_destroy(&tx->tx_sync_lock);
 
        cv_destroy(&tx->tx_sync_more_cv);
@@ -96,10 +98,15 @@ txg_fini(dsl_pool_t *dp)
                int i;
 
                mutex_destroy(&tx->tx_cpu[c].tc_lock);
-               for (i = 0; i < TXG_SIZE; i++)
+               for (i = 0; i < TXG_SIZE; i++) {
                        cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
+                       list_destroy(&tx->tx_cpu[c].tc_callbacks[i]);
+               }
        }
 
+       if (tx->tx_commit_cb_taskq != NULL)
+               taskq_destroy(tx->tx_commit_cb_taskq);
+
        kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
 
        bzero(tx, sizeof (tx_state_t));
@@ -129,7 +136,7 @@ txg_sync_start(dsl_pool_t *dp)
         * 32-bit x86.  This is due in part to nested pools and
         * scrub_visitbp() recursion.
         */
-       tx->tx_sync_thread = thread_create(NULL, 12<<10, txg_sync_thread,
+       tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread,
            dp, 0, &p0, TS_RUN, minclsyspri);
 
        mutex_exit(&tx->tx_sync_lock);
@@ -159,7 +166,8 @@ txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time)
        CALLB_CPR_SAFE_BEGIN(cpr);
 
        if (time)
-               (void) cv_timedwait(cv, &tx->tx_sync_lock, lbolt + time);
+               (void) cv_timedwait(cv, &tx->tx_sync_lock,
+                   ddi_get_lbolt() + time);
        else
                cv_wait(cv, &tx->tx_sync_lock);
 
@@ -179,7 +187,11 @@ txg_sync_stop(dsl_pool_t *dp)
         * Finish off any work in progress.
         */
        ASSERT(tx->tx_threads == 2);
-       txg_wait_synced(dp, 0);
+
+       /*
+        * We need to ensure that we've vacated the deferred space_maps.
+        */
+       txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE);
 
        /*
         * Wake all sync threads and wait for them to die.
@@ -229,6 +241,17 @@ txg_rele_to_quiesce(txg_handle_t *th)
 }
 
 void
+txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks)
+{
+       tx_cpu_t *tc = th->th_cpu;
+       int g = th->th_txg & TXG_MASK;
+
+       mutex_enter(&tc->tc_lock);
+       list_move_tail(&tc->tc_callbacks[g], tx_callbacks);
+       mutex_exit(&tc->tc_lock);
+}
+
+void
 txg_rele_to_sync(txg_handle_t *th)
 {
        tx_cpu_t *tc = th->th_cpu;
@@ -279,8 +302,58 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
 }
 
 static void
+txg_do_callbacks(list_t *cb_list)
+{
+       dmu_tx_do_callbacks(cb_list, 0);
+
+       list_destroy(cb_list);
+
+       kmem_free(cb_list, sizeof (list_t));
+}
+
+/*
+ * Dispatch the commit callbacks registered on this txg to worker threads.
+ */
+static void
+txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
+{
+       int c;
+       tx_state_t *tx = &dp->dp_tx;
+       list_t *cb_list;
+
+       for (c = 0; c < max_ncpus; c++) {
+               tx_cpu_t *tc = &tx->tx_cpu[c];
+               /* No need to lock tx_cpu_t at this point */
+
+               int g = txg & TXG_MASK;
+
+               if (list_is_empty(&tc->tc_callbacks[g]))
+                       continue;
+
+               if (tx->tx_commit_cb_taskq == NULL) {
+                       /*
+                        * Commit callback taskq hasn't been created yet.
+                        */
+                       tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
+                           max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2,
+                           TASKQ_PREPOPULATE);
+               }
+
+               cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
+               list_create(cb_list, sizeof (dmu_tx_callback_t),
+                   offsetof(dmu_tx_callback_t, dcb_node));
+
+               list_move_tail(&tc->tc_callbacks[g], cb_list);
+
+               (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *)
+                   txg_do_callbacks, cb_list, TQ_SLEEP);
+       }
+}
+
+static void
 txg_sync_thread(dsl_pool_t *dp)
 {
+       spa_t *spa = dp->dp_spa;
        tx_state_t *tx = &dp->dp_tx;
        callb_cpr_t cpr;
        uint64_t start, delta;
@@ -293,20 +366,19 @@ txg_sync_thread(dsl_pool_t *dp)
                uint64_t txg;
 
                /*
-                * We sync when we're scrubbing, there's someone waiting
+                * We sync when we're scanning, there's someone waiting
                 * on us, or the quiesce thread has handed off a txg to
                 * us, or we have reached our timeout.
                 */
                timer = (delta >= timeout ? 0 : timeout - delta);
-               while ((dp->dp_scrub_func == SCRUB_FUNC_NONE ||
-                   spa_shutting_down(dp->dp_spa)) &&
+               while (!dsl_scan_active(dp->dp_scan) &&
                    !tx->tx_exiting && timer > 0 &&
                    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
                    tx->tx_quiesced_txg == 0) {
                        dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
                            tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
                        txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
-                       delta = lbolt - start;
+                       delta = ddi_get_lbolt() - start;
                        timer = (delta > timeout ? 0 : timeout - delta);
                }
 
@@ -324,8 +396,6 @@ txg_sync_thread(dsl_pool_t *dp)
                if (tx->tx_exiting)
                        txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
 
-               rw_enter(&tx->tx_suspend, RW_WRITER);
-
                /*
                 * Consume the quiesced txg which has been handed off to
                 * us.  This may cause the quiescing thread to now be
@@ -335,22 +405,24 @@ txg_sync_thread(dsl_pool_t *dp)
                tx->tx_quiesced_txg = 0;
                tx->tx_syncing_txg = txg;
                cv_broadcast(&tx->tx_quiesce_more_cv);
-               rw_exit(&tx->tx_suspend);
 
                dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
                    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
                mutex_exit(&tx->tx_sync_lock);
 
-               start = lbolt;
-               spa_sync(dp->dp_spa, txg);
-               delta = lbolt - start;
+               start = ddi_get_lbolt();
+               spa_sync(spa, txg);
+               delta = ddi_get_lbolt() - start;
 
                mutex_enter(&tx->tx_sync_lock);
-               rw_enter(&tx->tx_suspend, RW_WRITER);
                tx->tx_synced_txg = txg;
                tx->tx_syncing_txg = 0;
-               rw_exit(&tx->tx_suspend);
                cv_broadcast(&tx->tx_sync_done_cv);
+
+               /*
+                * Dispatch commit callbacks to worker threads.
+                */
+               txg_dispatch_callbacks(dp, txg);
        }
 }
 
@@ -407,7 +479,7 @@ void
 txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks)
 {
        tx_state_t *tx = &dp->dp_tx;
-       int timeout = lbolt + ticks;
+       int timeout = ddi_get_lbolt() + ticks;
 
        /* don't delay if this txg could transition to quiesing immediately */
        if (tx->tx_open_txg > txg ||
@@ -420,7 +492,7 @@ txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks)
                return;
        }
 
-       while (lbolt < timeout &&
+       while (ddi_get_lbolt() < timeout &&
            tx->tx_syncing_txg < txg-1 && !txg_stalled(dp))
                (void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock,
                    timeout);
@@ -436,7 +508,7 @@ txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
        mutex_enter(&tx->tx_sync_lock);
        ASSERT(tx->tx_threads == 2);
        if (txg == 0)
-               txg = tx->tx_open_txg;
+               txg = tx->tx_open_txg + TXG_DEFER_SIZE;
        if (tx->tx_sync_txg_waiting < txg)
                tx->tx_sync_txg_waiting = txg;
        dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
@@ -487,21 +559,6 @@ txg_sync_waiting(dsl_pool_t *dp)
            tx->tx_quiesced_txg != 0);
 }
 
-void
-txg_suspend(dsl_pool_t *dp)
-{
-       tx_state_t *tx = &dp->dp_tx;
-       /* XXX some code paths suspend when they are already suspended! */
-       rw_enter(&tx->tx_suspend, RW_READER);
-}
-
-void
-txg_resume(dsl_pool_t *dp)
-{
-       tx_state_t *tx = &dp->dp_tx;
-       rw_exit(&tx->tx_suspend);
-}
-
 /*
  * Per-txg object lists.
  */
@@ -559,6 +616,34 @@ txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
 }
 
 /*
+ * Add an entry to the end of the list (walks list to find end).
+ * Returns 0 if it's a new entry, 1 if it's already there.
+ */
+int
+txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg)
+{
+       int t = txg & TXG_MASK;
+       txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+       int already_on_list;
+
+       mutex_enter(&tl->tl_lock);
+       already_on_list = tn->tn_member[t];
+       if (!already_on_list) {
+               txg_node_t **tp;
+
+               for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t])
+                       continue;
+
+               tn->tn_member[t] = 1;
+               tn->tn_next[t] = NULL;
+               *tp = tn;
+       }
+       mutex_exit(&tl->tl_lock);
+
+       return (already_on_list);
+}
+
+/*
  * Remove the head of the list and return it.
  */
 void *
index 34d7e0c..692cda1 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/zfs_context.h>
 #include <sys/uberblock_impl.h>
 #include <sys/vdev_impl.h>
@@ -58,6 +55,7 @@ uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg)
        ub->ub_txg = txg;
        ub->ub_guid_sum = rvd->vdev_guid_sum;
        ub->ub_timestamp = gethrestime_sec();
+       ub->ub_software_version = SPA_VERSION;
 
        return (ub->ub_rootbp.blk_birth == txg);
 }
index bb5024f..a61f29b 100644 (file)
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -40,6 +39,7 @@
 #include <sys/fs/zfs.h>
 #include <sys/arc.h>
 #include <sys/zil.h>
+#include <sys/dsl_scan.h>
 
 /*
  * Virtual device management.
@@ -54,6 +54,7 @@ static vdev_ops_t *vdev_ops_table[] = {
        &vdev_disk_ops,
        &vdev_file_ops,
        &vdev_missing_ops,
+       &vdev_hole_ops,
        NULL
 };
 
@@ -281,7 +282,7 @@ vdev_compact_children(vdev_t *pvd)
 /*
  * Allocate and minimally initialize a vdev_t.
  */
-static vdev_t *
+vdev_t *
 vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 {
        vdev_t *vd;
@@ -293,21 +294,18 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
                spa->spa_root_vdev = vd;
        }
 
-       if (guid == 0) {
+       if (guid == 0 && ops != &vdev_hole_ops) {
                if (spa->spa_root_vdev == vd) {
                        /*
                         * The root vdev's guid will also be the pool guid,
                         * which must be unique among all pools.
                         */
-                       while (guid == 0 || spa_guid_exists(guid, 0))
-                               guid = spa_get_random(-1ULL);
+                       guid = spa_generate_guid(NULL);
                } else {
                        /*
                         * Any other vdev's guid must be unique within the pool.
                         */
-                       while (guid == 0 ||
-                           spa_guid_exists(spa_guid(spa), guid))
-                               guid = spa_get_random(-1ULL);
+                       guid = spa_generate_guid(spa);
                }
                ASSERT(!spa_guid_exists(spa_guid(spa), guid));
        }
@@ -318,6 +316,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
        vd->vdev_guid_sum = guid;
        vd->vdev_ops = ops;
        vd->vdev_state = VDEV_STATE_CLOSED;
+       vd->vdev_ishole = (ops == &vdev_hole_ops);
 
        mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -397,6 +396,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
        if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
                return (ENOTSUP);
 
+       if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
+               return (ENOTSUP);
+
        /*
         * Set the nparity property for RAID-Z vdevs.
         */
@@ -404,10 +406,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
        if (ops == &vdev_raidz_ops) {
                if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
                    &nparity) == 0) {
-                       /*
-                        * Currently, we can only support 3 parity devices.
-                        */
-                       if (nparity == 0 || nparity > 3)
+                       if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
                                return (EINVAL);
                        /*
                         * Previous versions could only support 1 or 2 parity
@@ -472,15 +471,33 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
        (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
 
        /*
+        * Retrieve the vdev creation time.
+        */
+       (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
+           &vd->vdev_crtxg);
+
+       /*
         * If we're a top-level vdev, try to load the allocation parameters.
         */
-       if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
+       if (parent && !parent->vdev_parent &&
+           (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
                (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
                    &vd->vdev_ms_array);
                (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
                    &vd->vdev_ms_shift);
                (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
                    &vd->vdev_asize);
+               (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
+                   &vd->vdev_removing);
+       }
+
+       if (parent && !parent->vdev_parent) {
+               ASSERT(alloctype == VDEV_ALLOC_LOAD ||
+                   alloctype == VDEV_ALLOC_ADD ||
+                   alloctype == VDEV_ALLOC_SPLIT ||
+                   alloctype == VDEV_ALLOC_ROOTPOOL);
+               vd->vdev_mg = metaslab_group_create(islog ?
+                   spa_log_class(spa) : spa_normal_class(spa), vd);
        }
 
        /*
@@ -510,15 +527,27 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
                /*
                 * When importing a pool, we want to ignore the persistent fault
                 * state, as the diagnosis made on another system may not be
-                * valid in the current context.
+                * valid in the current context.  Local vdevs will
+                * remain in the faulted state.
                 */
-               if (spa->spa_load_state == SPA_LOAD_OPEN) {
+               if (spa_load_state(spa) == SPA_LOAD_OPEN) {
                        (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
                            &vd->vdev_faulted);
                        (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
                            &vd->vdev_degraded);
                        (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
                            &vd->vdev_removed);
+
+                       if (vd->vdev_faulted || vd->vdev_degraded) {
+                               char *aux;
+
+                               vd->vdev_label_aux =
+                                   VDEV_AUX_ERR_EXCEEDED;
+                               if (nvlist_lookup_string(nv,
+                                   ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
+                                   strcmp(aux, "external") == 0)
+                                       vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
+                       }
                }
        }
 
@@ -544,6 +573,7 @@ vdev_free(vdev_t *vd)
        vdev_close(vd);
 
        ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
+       ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
 
        /*
         * Free all children.
@@ -557,8 +587,10 @@ vdev_free(vdev_t *vd)
        /*
         * Discard allocation state.
         */
-       if (vd == vd->vdev_top)
+       if (vd->vdev_mg != NULL) {
                vdev_metaslab_fini(vd);
+               metaslab_group_destroy(vd->vdev_mg);
+       }
 
        ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
        ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0);
@@ -705,6 +737,7 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
        mvd->vdev_min_asize = cvd->vdev_min_asize;
        mvd->vdev_ashift = cvd->vdev_ashift;
        mvd->vdev_state = cvd->vdev_state;
+       mvd->vdev_crtxg = cvd->vdev_crtxg;
 
        vdev_remove_child(pvd, cvd);
        vdev_add_child(pvd, mvd);
@@ -746,6 +779,7 @@ vdev_remove_parent(vdev_t *cvd)
         */
        if (mvd->vdev_top == mvd) {
                uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
+               cvd->vdev_orig_guid = cvd->vdev_guid;
                cvd->vdev_guid += guid_delta;
                cvd->vdev_guid_sum += guid_delta;
        }
@@ -765,16 +799,22 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 {
        spa_t *spa = vd->vdev_spa;
        objset_t *mos = spa->spa_meta_objset;
-       metaslab_class_t *mc;
        uint64_t m;
        uint64_t oldc = vd->vdev_ms_count;
        uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
        metaslab_t **mspp;
        int error;
 
-       if (vd->vdev_ms_shift == 0)     /* not being allocated from yet */
+       ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
+
+       /*
+        * This vdev is not being allocated from yet or is a hole.
+        */
+       if (vd->vdev_ms_shift == 0)
                return (0);
 
+       ASSERT(!vd->vdev_ishole);
+
        /*
         * Compute the raidz-deflation ratio.  Note, we hard-code
         * in 128k (1 << 17) because it is the current "typical" blocksize.
@@ -786,14 +826,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 
        ASSERT(oldc <= newc);
 
-       if (vd->vdev_islog)
-               mc = spa->spa_log_class;
-       else
-               mc = spa->spa_normal_class;
-
-       if (vd->vdev_mg == NULL)
-               vd->vdev_mg = metaslab_group_create(mc, vd);
-
        mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
 
        if (oldc != 0) {
@@ -828,6 +860,20 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
                    m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
        }
 
+       if (txg == 0)
+               spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
+
+       /*
+        * If the vdev is being removed we don't activate
+        * the metaslabs since we want to ensure that no new
+        * allocations are performed on this device.
+        */
+       if (oldc == 0 && !vd->vdev_removing)
+               metaslab_group_activate(vd->vdev_mg);
+
+       if (txg == 0)
+               spa_config_exit(spa, SCL_ALLOC, FTAG);
+
        return (0);
 }
 
@@ -838,6 +884,7 @@ vdev_metaslab_fini(vdev_t *vd)
        uint64_t count = vd->vdev_ms_count;
 
        if (vd->vdev_ms != NULL) {
+               metaslab_group_passivate(vd->vdev_mg);
                for (m = 0; m < count; m++)
                        if (vd->vdev_ms[m] != NULL)
                                metaslab_fini(vd->vdev_ms[m]);
@@ -965,6 +1012,10 @@ vdev_probe(vdev_t *vd, zio_t *zio)
                    vdev_probe_done, vps,
                    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
 
+               /*
+                * We can't change the vdev state in this context, so we
+                * kick off an async task to do it on our behalf.
+                */
                if (zio != NULL) {
                        vd->vdev_probe_wanted = B_TRUE;
                        spa_async_request(spa, SPA_ASYNC_PROBE);
@@ -1007,12 +1058,35 @@ vdev_open_child(void *arg)
        vd->vdev_open_thread = NULL;
 }
 
+boolean_t
+vdev_uses_zvols(vdev_t *vd)
+{
+       if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
+           strlen(ZVOL_DIR)) == 0)
+               return (B_TRUE);
+       for (int c = 0; c < vd->vdev_children; c++)
+               if (vdev_uses_zvols(vd->vdev_child[c]))
+                       return (B_TRUE);
+       return (B_FALSE);
+}
+
 void
 vdev_open_children(vdev_t *vd)
 {
        taskq_t *tq;
        int children = vd->vdev_children;
 
+       /*
+        * in order to handle pools on top of zvols, do the opens
+        * in a single thread so that the same thread holds the
+        * spa_namespace_lock
+        */
+       if (vdev_uses_zvols(vd)) {
+               for (int c = 0; c < children; c++)
+                       vd->vdev_child[c]->vdev_open_error =
+                           vdev_open(vd->vdev_child[c]);
+               return;
+       }
        tq = taskq_create("vdev_open", children, minclsyspri,
            children, children, TASKQ_PREPOPULATE);
 
@@ -1046,10 +1120,16 @@ vdev_open(vdev_t *vd)
        vd->vdev_cant_write = B_FALSE;
        vd->vdev_min_asize = vdev_get_min_asize(vd);
 
+       /*
+        * If this vdev is not removed, check its fault status.  If it's
+        * faulted, bail out of the open.
+        */
        if (!vd->vdev_removed && vd->vdev_faulted) {
                ASSERT(vd->vdev_children == 0);
+               ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
+                   vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
                vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
-                   VDEV_AUX_ERR_EXCEEDED);
+                   vd->vdev_label_aux);
                return (ENXIO);
        } else if (vd->vdev_offline) {
                ASSERT(vd->vdev_children == 0);
@@ -1059,6 +1139,11 @@ vdev_open(vdev_t *vd)
 
        error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
 
+       /*
+        * Reset the vdev_reopening flag so that we actually close
+        * the vdev on error.
+        */
+       vd->vdev_reopening = B_FALSE;
        if (zio_injection_enabled && error == 0)
                error = zio_handle_device_injection(vd, NULL, ENXIO);
 
@@ -1074,14 +1159,33 @@ vdev_open(vdev_t *vd)
 
        vd->vdev_removed = B_FALSE;
 
+       /*
+        * Recheck the faulted flag now that we have confirmed that
+        * the vdev is accessible.  If we're faulted, bail.
+        */
+       if (vd->vdev_faulted) {
+               ASSERT(vd->vdev_children == 0);
+               ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
+                   vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
+               vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+                   vd->vdev_label_aux);
+               return (ENXIO);
+       }
+
        if (vd->vdev_degraded) {
                ASSERT(vd->vdev_children == 0);
                vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
                    VDEV_AUX_ERR_EXCEEDED);
        } else {
-               vd->vdev_state = VDEV_STATE_HEALTHY;
+               vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
        }
 
+       /*
+        * For hole or missing vdevs we just return success.
+        */
+       if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
+               return (0);
+
        for (int c = 0; c < vd->vdev_children; c++) {
                if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
                        vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
@@ -1157,8 +1261,8 @@ vdev_open(vdev_t *vd)
         */
        if (vd->vdev_ops->vdev_op_leaf &&
            (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
-               vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                   VDEV_AUX_IO_FAILURE);
+               vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+                   VDEV_AUX_ERR_EXCEEDED);
                return (error);
        }
 
@@ -1189,7 +1293,7 @@ vdev_validate(vdev_t *vd)
 {
        spa_t *spa = vd->vdev_spa;
        nvlist_t *label;
-       uint64_t guid, top_guid;
+       uint64_t guid = 0, top_guid;
        uint64_t state;
 
        for (int c = 0; c < vd->vdev_children; c++)
@@ -1202,6 +1306,8 @@ vdev_validate(vdev_t *vd)
         * overwrite the previous state.
         */
        if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
+               uint64_t aux_guid = 0;
+               nvlist_t *nvl;
 
                if ((label = vdev_label_read_config(vd)) == NULL) {
                        vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
@@ -1209,6 +1315,18 @@ vdev_validate(vdev_t *vd)
                        return (0);
                }
 
+               /*
+                * Determine if this vdev has been split off into another
+                * pool.  If so, then refuse to open it.
+                */
+               if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
+                   &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
+                       vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+                           VDEV_AUX_SPLIT_POOL);
+                       nvlist_free(label);
+                       return (0);
+               }
+
                if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
                    &guid) != 0 || guid != spa_guid(spa)) {
                        vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
@@ -1217,6 +1335,11 @@ vdev_validate(vdev_t *vd)
                        return (0);
                }
 
+               if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
+                   != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
+                   &aux_guid) != 0)
+                       aux_guid = 0;
+
                /*
                 * If this vdev just became a top-level vdev because its
                 * sibling was detached, it will have adopted the parent's
@@ -1224,12 +1347,16 @@ vdev_validate(vdev_t *vd)
                 * Fortunately, either version of the label will have the
                 * same top guid, so if we're a top-level vdev, we can
                 * safely compare to that instead.
+                *
+                * If we split this vdev off instead, then we also check the
+                * original pool's guid.  We don't want to consider the vdev
+                * corrupt if it is partway through a split operation.
                 */
                if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
                    &guid) != 0 ||
                    nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
                    &top_guid) != 0 ||
-                   (vd->vdev_guid != guid &&
+                   ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
                    (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
                        vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
                            VDEV_AUX_CORRUPT_DATA);
@@ -1252,7 +1379,7 @@ vdev_validate(vdev_t *vd)
                 * state of the pool.
                 */
                if (!spa->spa_load_verbatim &&
-                   spa->spa_load_state == SPA_LOAD_OPEN &&
+                   spa_load_state(spa) == SPA_LOAD_OPEN &&
                    state != POOL_STATE_ACTIVE)
                        return (EBADF);
 
@@ -1275,9 +1402,17 @@ void
 vdev_close(vdev_t *vd)
 {
        spa_t *spa = vd->vdev_spa;
+       vdev_t *pvd = vd->vdev_parent;
 
        ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
+       /*
+        * If our parent is reopening, then we are as well, unless we are
+        * going offline.
+        */
+       if (pvd != NULL && pvd->vdev_reopening)
+               vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);
+
        vd->vdev_ops->vdev_op_close(vd);
 
        vdev_cache_purge(vd);
@@ -1297,12 +1432,49 @@ vdev_close(vdev_t *vd)
 }
 
 void
+vdev_hold(vdev_t *vd)
+{
+       spa_t *spa = vd->vdev_spa;
+
+       ASSERT(spa_is_root(spa));
+       if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+               return;
+
+       for (int c = 0; c < vd->vdev_children; c++)
+               vdev_hold(vd->vdev_child[c]);
+
+       if (vd->vdev_ops->vdev_op_leaf)
+               vd->vdev_ops->vdev_op_hold(vd);
+}
+
+void
+vdev_rele(vdev_t *vd)
+{
+       spa_t *spa = vd->vdev_spa;
+
+       ASSERT(spa_is_root(spa));
+       for (int c = 0; c < vd->vdev_children; c++)
+               vdev_rele(vd->vdev_child[c]);
+
+       if (vd->vdev_ops->vdev_op_leaf)
+               vd->vdev_ops->vdev_op_rele(vd);
+}
+
+/*
+ * Reopen all interior vdevs and any unopened leaves.  We don't actually
+ * reopen leaf vdevs which had previously been opened as they might deadlock
+ * on the spa_config_lock.  Instead we only obtain the leaf's physical size.
+ * If the leaf has never been opened then open it, as usual.
+ */
+void
 vdev_reopen(vdev_t *vd)
 {
        spa_t *spa = vd->vdev_spa;
 
        ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
+       /* set the reopening flag unless we're taking the vdev offline */
+       vd->vdev_reopening = !vd->vdev_offline;
        vdev_close(vd);
        (void) vdev_open(vd);
 
@@ -1370,6 +1542,7 @@ void
 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
 {
        ASSERT(vd == vd->vdev_top);
+       ASSERT(!vd->vdev_ishole);
        ASSERT(ISP2(flags));
 
        if (flags & VDD_METASLAB)
@@ -1385,7 +1558,7 @@ vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
  * DTLs.
  *
  * A vdev's DTL (dirty time log) is the set of transaction groups for which
- * the vdev has less than perfect replication.  There are three kinds of DTL:
+ * the vdev has less than perfect replication.  There are four kinds of DTL:
  *
  * DTL_MISSING: txgs for which the vdev has no valid copies of the data
  *
@@ -1479,14 +1652,16 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
                vdev_dtl_reassess(vd->vdev_child[c], txg,
                    scrub_txg, scrub_done);
 
-       if (vd == spa->spa_root_vdev)
+       if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
                return;
 
        if (vd->vdev_ops->vdev_op_leaf) {
+               dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+
                mutex_enter(&vd->vdev_dtl_lock);
                if (scrub_txg != 0 &&
-                   (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) {
-                       /* XXX should check scrub_done? */
+                   (spa->spa_scrub_started ||
+                   (scn && scn->scn_phys.scn_errors == 0))) {
                        /*
                         * We completed a scrub up to scrub_txg.  If we
                         * did it without rebooting, then the scrub dtl
@@ -1534,6 +1709,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
 
        mutex_enter(&vd->vdev_dtl_lock);
        for (int t = 0; t < DTL_TYPES; t++) {
+               /* account for child's outage in parent's missing map */
+               int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
                if (t == DTL_SCRUB)
                        continue;                       /* leaf vdevs only */
                if (t == DTL_PARTIAL)
@@ -1546,7 +1723,7 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
                for (int c = 0; c < vd->vdev_children; c++) {
                        vdev_t *cvd = vd->vdev_child[c];
                        mutex_enter(&cvd->vdev_dtl_lock);
-                       space_map_ref_add_map(&reftree, &cvd->vdev_dtl[t], 1);
+                       space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1);
                        mutex_exit(&cvd->vdev_dtl_lock);
                }
                space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
@@ -1569,6 +1746,8 @@ vdev_dtl_load(vdev_t *vd)
        if (smo->smo_object == 0)
                return (0);
 
+       ASSERT(!vd->vdev_ishole);
+
        if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
                return (error);
 
@@ -1596,6 +1775,8 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
        dmu_buf_t *db;
        dmu_tx_t *tx;
 
+       ASSERT(!vd->vdev_ishole);
+
        tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 
        if (vd->vdev_detached) {
@@ -1732,7 +1913,7 @@ vdev_load(vdev_t *vd)
        /*
         * If this is a top-level vdev, initialize its metaslabs.
         */
-       if (vd == vd->vdev_top &&
+       if (vd == vd->vdev_top && !vd->vdev_ishole &&
            (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
            vdev_metaslab_init(vd, 0) != 0))
                vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
@@ -1789,12 +1970,54 @@ vdev_validate_aux(vdev_t *vd)
 }
 
 void
+vdev_remove(vdev_t *vd, uint64_t txg)
+{
+       spa_t *spa = vd->vdev_spa;
+       objset_t *mos = spa->spa_meta_objset;
+       dmu_tx_t *tx;
+
+       tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+
+       if (vd->vdev_dtl_smo.smo_object) {
+               ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0);
+               (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
+               vd->vdev_dtl_smo.smo_object = 0;
+       }
+
+       if (vd->vdev_ms != NULL) {
+               for (int m = 0; m < vd->vdev_ms_count; m++) {
+                       metaslab_t *msp = vd->vdev_ms[m];
+
+                       if (msp == NULL || msp->ms_smo.smo_object == 0)
+                               continue;
+
+                       ASSERT3U(msp->ms_smo.smo_alloc, ==, 0);
+                       (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
+                       msp->ms_smo.smo_object = 0;
+               }
+       }
+
+       if (vd->vdev_ms_array) {
+               (void) dmu_object_free(mos, vd->vdev_ms_array, tx);
+               vd->vdev_ms_array = 0;
+               vd->vdev_ms_shift = 0;
+       }
+       dmu_tx_commit(tx);
+}
+
+void
 vdev_sync_done(vdev_t *vd, uint64_t txg)
 {
        metaslab_t *msp;
+       boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
+
+       ASSERT(!vd->vdev_ishole);
 
        while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
                metaslab_sync_done(msp, txg);
+
+       if (reassess)
+               metaslab_sync_reassess(vd->vdev_mg);
 }
 
 void
@@ -1805,6 +2028,8 @@ vdev_sync(vdev_t *vd, uint64_t txg)
        metaslab_t *msp;
        dmu_tx_t *tx;
 
+       ASSERT(!vd->vdev_ishole);
+
        if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
                ASSERT(vd == vd->vdev_top);
                tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
@@ -1815,6 +2040,12 @@ vdev_sync(vdev_t *vd, uint64_t txg)
                dmu_tx_commit(tx);
        }
 
+       /*
+        * Remove the metadata associated with this vdev once it's empty.
+        */
+       if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
+               vdev_remove(vd, txg);
+
        while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
                metaslab_sync(msp, txg);
                (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
@@ -1837,11 +2068,11 @@ vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
  * not be opened, and no I/O is attempted.
  */
 int
-vdev_fault(spa_t *spa, uint64_t guid)
+vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
 {
        vdev_t *vd;
 
-       spa_vdev_state_enter(spa);
+       spa_vdev_state_enter(spa, SCL_NONE);
 
        if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
                return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -1850,18 +2081,25 @@ vdev_fault(spa_t *spa, uint64_t guid)
                return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
        /*
+        * We don't directly use the aux state here, but if we do a
+        * vdev_reopen(), we need this value to be present to remember why we
+        * were faulted.
+        */
+       vd->vdev_label_aux = aux;
+
+       /*
         * Faulted state takes precedence over degraded.
         */
+       vd->vdev_delayed_close = B_FALSE;
        vd->vdev_faulted = 1ULL;
        vd->vdev_degraded = 0ULL;
-       vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED);
+       vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
 
        /*
-        * If marking the vdev as faulted cause the top-level vdev to become
-        * unavailable, then back off and simply mark the vdev as degraded
-        * instead.
+        * If this device has the only valid copy of the data, then
+        * back off and simply mark the vdev as degraded instead.
         */
-       if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) {
+       if (!vd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
                vd->vdev_degraded = 1ULL;
                vd->vdev_faulted = 0ULL;
 
@@ -1871,10 +2109,8 @@ vdev_fault(spa_t *spa, uint64_t guid)
                 */
                vdev_reopen(vd);
 
-               if (vdev_readable(vd)) {
-                       vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
-                           VDEV_AUX_ERR_EXCEEDED);
-               }
+               if (vdev_readable(vd))
+                       vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
        }
 
        return (spa_vdev_state_exit(spa, vd, 0));
@@ -1886,11 +2122,11 @@ vdev_fault(spa_t *spa, uint64_t guid)
  * as I/O is concerned.
  */
 int
-vdev_degrade(spa_t *spa, uint64_t guid)
+vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
 {
        vdev_t *vd;
 
-       spa_vdev_state_enter(spa);
+       spa_vdev_state_enter(spa, SCL_NONE);
 
        if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
                return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -1907,7 +2143,7 @@ vdev_degrade(spa_t *spa, uint64_t guid)
        vd->vdev_degraded = 1ULL;
        if (!vdev_is_dead(vd))
                vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
-                   VDEV_AUX_ERR_EXCEEDED);
+                   aux);
 
        return (spa_vdev_state_exit(spa, vd, 0));
 }
@@ -1923,7 +2159,7 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
 {
        vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
 
-       spa_vdev_state_enter(spa);
+       spa_vdev_state_enter(spa, SCL_NONE);
 
        if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
                return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -1969,13 +2205,16 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
        return (spa_vdev_state_exit(spa, vd, 0));
 }
 
-int
-vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
+static int
+vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
 {
        vdev_t *vd, *tvd;
-       int error;
+       int error = 0;
+       uint64_t generation;
+       metaslab_group_t *mg;
 
-       spa_vdev_state_enter(spa);
+top:
+       spa_vdev_state_enter(spa, SCL_ALLOC);
 
        if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
                return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -1984,6 +2223,8 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
                return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
        tvd = vd->vdev_top;
+       mg = tvd->vdev_mg;
+       generation = spa->spa_config_generation + 1;
 
        /*
         * If the device isn't already offline, try to offline it.
@@ -1999,6 +2240,37 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
                        return (spa_vdev_state_exit(spa, NULL, EBUSY));
 
                /*
+                * If the top-level is a slog and it has had allocations
+                * then proceed.  We check that the vdev's metaslab group
+                * is not NULL since it's possible that we may have just
+                * added this vdev but not yet initialized its metaslabs.
+                */
+               if (tvd->vdev_islog && mg != NULL) {
+                       /*
+                        * Prevent any future allocations.
+                        */
+                       metaslab_group_passivate(mg);
+                       (void) spa_vdev_state_exit(spa, vd, 0);
+
+                       error = spa_offline_log(spa);
+
+                       spa_vdev_state_enter(spa, SCL_ALLOC);
+
+                       /*
+                        * Check to see if the config has changed.
+                        */
+                       if (error || generation != spa->spa_config_generation) {
+                               metaslab_group_activate(mg);
+                               if (error)
+                                       return (spa_vdev_state_exit(spa,
+                                           vd, error));
+                               (void) spa_vdev_state_exit(spa, vd, 0);
+                               goto top;
+                       }
+                       ASSERT3U(tvd->vdev_stat.vs_alloc, ==, 0);
+               }
+
+               /*
                 * Offline this device and reopen its top-level vdev.
                 * If the top-level vdev is a log device then just offline
                 * it. Otherwise, if this action results in the top-level
@@ -2013,28 +2285,30 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
                        vdev_reopen(tvd);
                        return (spa_vdev_state_exit(spa, NULL, EBUSY));
                }
+
+               /*
+                * Add the device back into the metaslab rotor so that
+                * once we online the device it's open for business.
+                */
+               if (tvd->vdev_islog && mg != NULL)
+                       metaslab_group_activate(mg);
        }
 
        vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
 
-       if (!tvd->vdev_islog || !vdev_is_dead(tvd))
-               return (spa_vdev_state_exit(spa, vd, 0));
+       return (spa_vdev_state_exit(spa, vd, 0));
+}
 
-       (void) spa_vdev_state_exit(spa, vd, 0);
+int
+vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
+{
+       int error;
 
-       error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
-           NULL, DS_FIND_CHILDREN);
-       if (error) {
-               (void) vdev_online(spa, guid, 0, NULL);
-               return (error);
-       }
-       /*
-        * If we successfully offlined the log device then we need to
-        * sync out the current txg so that the "stubby" block can be
-        * removed by zil_sync().
-        */
-       txg_wait_synced(spa->spa_dsl_pool, 0);
-       return (0);
+       mutex_enter(&spa->spa_vdev_top_lock);
+       error = vdev_offline_locked(spa, guid, flags);
+       mutex_exit(&spa->spa_vdev_top_lock);
+
+       return (error);
 }
 
 /*
@@ -2068,12 +2342,21 @@ vdev_clear(spa_t *spa, vdev_t *vd)
        if (vd->vdev_faulted || vd->vdev_degraded ||
            !vdev_readable(vd) || !vdev_writeable(vd)) {
 
+               /*
+                * When reopening in response to a clear event, it may be due to
+                * a fmadm repair request.  In this case, if the device is
+                * still broken, we want to still post the ereport again.
+                */
+               vd->vdev_forcefault = B_TRUE;
+
                vd->vdev_faulted = vd->vdev_degraded = 0;
                vd->vdev_cant_read = B_FALSE;
                vd->vdev_cant_write = B_FALSE;
 
                vdev_reopen(vd);
 
+               vd->vdev_forcefault = B_FALSE;
+
                if (vd != rvd)
                        vdev_state_dirty(vd->vdev_top);
 
@@ -2082,12 +2365,30 @@ vdev_clear(spa_t *spa, vdev_t *vd)
 
                spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
        }
+
+       /*
+        * When clearing a FMA-diagnosed fault, we always want to
+        * unspare the device, as we assume that the original spare was
+        * done in response to the FMA fault.
+        */
+       if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
+           vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+           vd->vdev_parent->vdev_child[0] == vd)
+               vd->vdev_unspare = B_TRUE;
 }
 
 boolean_t
 vdev_is_dead(vdev_t *vd)
 {
-       return (vd->vdev_state < VDEV_STATE_DEGRADED);
+       /*
+        * Holes and missing devices are always considered "dead".
+        * This simplifies the code since we don't have to check for
+        * these types of devices in the various code paths.
+        * Instead we rely on the fact that we skip over dead devices
+        * before issuing I/O to them.
+        */
+       return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
+           vd->vdev_ops == &vdev_missing_ops);
 }
 
 boolean_t
@@ -2116,7 +2417,7 @@ vdev_allocatable(vdev_t *vd)
         * we're asking two separate questions about it.
         */
        return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
-           !vd->vdev_cant_write);
+           !vd->vdev_cant_write && !vd->vdev_ishole);
 }
 
 boolean_t
@@ -2146,7 +2447,6 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
 
        mutex_enter(&vd->vdev_stat_lock);
        bcopy(&vd->vdev_stat, vs, sizeof (*vs));
-       vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors;
        vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
        vs->vs_state = vd->vdev_state;
        vs->vs_rsize = vdev_get_min_asize(vd);
@@ -2168,7 +2468,7 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
                                vs->vs_ops[t] += cvs->vs_ops[t];
                                vs->vs_bytes[t] += cvs->vs_bytes[t];
                        }
-                       vs->vs_scrub_examined += cvs->vs_scrub_examined;
+                       cvs->vs_scan_removing = cvd->vdev_removing;
                        mutex_exit(&vd->vdev_stat_lock);
                }
        }
@@ -2185,6 +2485,19 @@ vdev_clear_stats(vdev_t *vd)
 }
 
 void
+vdev_scan_stat_init(vdev_t *vd)
+{
+       vdev_stat_t *vs = &vd->vdev_stat;
+
+       for (int c = 0; c < vd->vdev_children; c++)
+               vdev_scan_stat_init(vd->vdev_child[c]);
+
+       mutex_enter(&vd->vdev_stat_lock);
+       vs->vs_scan_processed = 0;
+       mutex_exit(&vd->vdev_stat_lock);
+}
+
+void
 vdev_stat_update(zio_t *zio, uint64_t psize)
 {
        spa_t *spa = zio->io_spa;
@@ -2228,8 +2541,17 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
                mutex_enter(&vd->vdev_stat_lock);
 
                if (flags & ZIO_FLAG_IO_REPAIR) {
-                       if (flags & ZIO_FLAG_SCRUB_THREAD)
-                               vs->vs_scrub_repaired += psize;
+                       if (flags & ZIO_FLAG_SCRUB_THREAD) {
+                               dsl_scan_phys_t *scn_phys =
+                                   &spa->spa_dsl_pool->dp_scan->scn_phys;
+                               uint64_t *processed = &scn_phys->scn_processed;
+
+                               /* XXX cleanup? */
+                               if (vd->vdev_ops->vdev_op_leaf)
+                                       atomic_add_64(processed, psize);
+                               vs->vs_scan_processed += psize;
+                       }
+
                        if (flags & ZIO_FLAG_SELF_HEAL)
                                vs->vs_self_healed += psize;
                }
@@ -2254,6 +2576,14 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
            !(zio->io_flags & ZIO_FLAG_IO_RETRY))
                return;
 
+       /*
+        * Intent log writes won't propagate their error to the root
+        * I/O so don't mark these types of failures as pool-level
+        * errors.
+        */
+       if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
+               return;
+
        mutex_enter(&vd->vdev_stat_lock);
        if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
                if (zio->io_error == ECKSUM)
@@ -2267,14 +2597,17 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
 
        if (type == ZIO_TYPE_WRITE && txg != 0 &&
            (!(flags & ZIO_FLAG_IO_REPAIR) ||
-           (flags & ZIO_FLAG_SCRUB_THREAD))) {
+           (flags & ZIO_FLAG_SCRUB_THREAD) ||
+           spa->spa_claiming)) {
                /*
-                * This is either a normal write (not a repair), or it's a
-                * repair induced by the scrub thread.  In the normal case,
-                * we commit the DTL change in the same txg as the block
-                * was born.  In the scrub-induced repair case, we know that
-                * scrubs run in first-pass syncing context, so we commit
-                * the DTL change in spa->spa_syncing_txg.
+                * This is either a normal write (not a repair), or it's
+                * a repair induced by the scrub thread, or it's a repair
+                * made by zil_claim() during spa_load() in the first txg.
+                * In the normal case, we commit the DTL change in the same
+                * txg as the block was born.  In the scrub-induced repair
+                * case, we know that scrubs run in first-pass syncing context,
+                * so we commit the DTL change in spa_syncing_txg(spa).
+                * In the zil_claim() case, we commit in spa_first_txg(spa).
                 *
                 * We currently do not make DTL entries for failed spontaneous
                 * self-healing writes triggered by normal (non-scrubbing)
@@ -2287,9 +2620,12 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
                                ASSERT(flags & ZIO_FLAG_IO_REPAIR);
                                ASSERT(spa_sync_pass(spa) == 1);
                                vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
-                               commit_txg = spa->spa_syncing_txg;
+                               commit_txg = spa_syncing_txg(spa);
+                       } else if (spa->spa_claiming) {
+                               ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+                               commit_txg = spa_first_txg(spa);
                        }
-                       ASSERT(commit_txg >= spa->spa_syncing_txg);
+                       ASSERT(commit_txg >= spa_syncing_txg(spa));
                        if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
                                return;
                        for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
@@ -2301,45 +2637,19 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
        }
 }
 
-void
-vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
-{
-       vdev_stat_t *vs = &vd->vdev_stat;
-
-       for (int c = 0; c < vd->vdev_children; c++)
-               vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
-
-       mutex_enter(&vd->vdev_stat_lock);
-
-       if (type == POOL_SCRUB_NONE) {
-               /*
-                * Update completion and end time.  Leave everything else alone
-                * so we can report what happened during the previous scrub.
-                */
-               vs->vs_scrub_complete = complete;
-               vs->vs_scrub_end = gethrestime_sec();
-       } else {
-               vs->vs_scrub_type = type;
-               vs->vs_scrub_complete = 0;
-               vs->vs_scrub_examined = 0;
-               vs->vs_scrub_repaired = 0;
-               vs->vs_scrub_start = gethrestime_sec();
-               vs->vs_scrub_end = 0;
-       }
-
-       mutex_exit(&vd->vdev_stat_lock);
-}
-
 /*
- * Update the in-core space usage stats for this vdev and the root vdev.
+ * Update the in-core space usage stats for this vdev, its metaslab class,
+ * and the root vdev.
  */
 void
-vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
-    boolean_t update_root)
+vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
+    int64_t space_delta)
 {
        int64_t dspace_delta = space_delta;
        spa_t *spa = vd->vdev_spa;
        vdev_t *rvd = spa->spa_root_vdev;
+       metaslab_group_t *mg = vd->vdev_mg;
+       metaslab_class_t *mc = mg ? mg->mg_class : NULL;
 
        ASSERT(vd == vd->vdev_top);
 
@@ -2355,28 +2665,26 @@ vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
            vd->vdev_deflate_ratio;
 
        mutex_enter(&vd->vdev_stat_lock);
-       vd->vdev_stat.vs_space += space_delta;
        vd->vdev_stat.vs_alloc += alloc_delta;
+       vd->vdev_stat.vs_space += space_delta;
        vd->vdev_stat.vs_dspace += dspace_delta;
        mutex_exit(&vd->vdev_stat_lock);
 
-       if (update_root) {
-               ASSERT(rvd == vd->vdev_parent);
-               ASSERT(vd->vdev_ms_count != 0);
-
-               /*
-                * Don't count non-normal (e.g. intent log) space as part of
-                * the pool's capacity.
-                */
-               if (vd->vdev_mg->mg_class != spa->spa_normal_class)
-                       return;
-
+       if (mc == spa_normal_class(spa)) {
                mutex_enter(&rvd->vdev_stat_lock);
-               rvd->vdev_stat.vs_space += space_delta;
                rvd->vdev_stat.vs_alloc += alloc_delta;
+               rvd->vdev_stat.vs_space += space_delta;
                rvd->vdev_stat.vs_dspace += dspace_delta;
                mutex_exit(&rvd->vdev_stat_lock);
        }
+
+       if (mc != NULL) {
+               ASSERT(rvd == vd->vdev_parent);
+               ASSERT(vd->vdev_ms_count != 0);
+
+               metaslab_class_space_update(mc,
+                   alloc_delta, defer_delta, space_delta, dspace_delta);
+       }
 }
 
 /*
@@ -2428,7 +2736,7 @@ vdev_config_dirty(vdev_t *vd)
                 * sketchy, but it will work.
                 */
                nvlist_free(aux[c]);
-               aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE);
+               aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
 
                return;
        }
@@ -2449,7 +2757,8 @@ vdev_config_dirty(vdev_t *vd)
        } else {
                ASSERT(vd == vd->vdev_top);
 
-               if (!list_link_active(&vd->vdev_config_dirty_node))
+               if (!list_link_active(&vd->vdev_config_dirty_node) &&
+                   !vd->vdev_ishole)
                        list_insert_head(&spa->spa_config_dirty_list, vd);
        }
 }
@@ -2490,7 +2799,7 @@ vdev_state_dirty(vdev_t *vd)
            (dsl_pool_sync_context(spa_get_dsl(spa)) &&
            spa_config_held(spa, SCL_STATE, RW_READER)));
 
-       if (!list_link_active(&vd->vdev_state_dirty_node))
+       if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
                list_insert_head(&spa->spa_state_dirty_list, vd);
 }
 
@@ -2523,6 +2832,12 @@ vdev_propagate_state(vdev_t *vd)
                for (int c = 0; c < vd->vdev_children; c++) {
                        child = vd->vdev_child[c];
 
+                       /*
+                        * Don't factor holes into the decision.
+                        */
+                       if (child->vdev_ishole)
+                               continue;
+
                        if (!vdev_readable(child) ||
                            (!vdev_writeable(child) && spa_writeable(spa))) {
                                /*
@@ -2586,15 +2901,31 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
 
        /*
         * If we are setting the vdev state to anything but an open state, then
-        * always close the underlying device.  Otherwise, we keep accessible
-        * but invalid devices open forever.  We don't call vdev_close() itself,
-        * because that implies some extra checks (offline, etc) that we don't
-        * want here.  This is limited to leaf devices, because otherwise
-        * closing the device will affect other children.
+        * always close the underlying device unless the device has requested
+        * a delayed close (i.e. we're about to remove or fault the device).
+        * Otherwise, we keep accessible but invalid devices open forever.
+        * We don't call vdev_close() itself, because that implies some extra
+        * checks (offline, etc) that we don't want here.  This is limited to
+        * leaf devices, because otherwise closing the device will affect other
+        * children.
         */
-       if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf)
+       if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
+           vd->vdev_ops->vdev_op_leaf)
                vd->vdev_ops->vdev_op_close(vd);
 
+       /*
+        * If we have brought this vdev back into service, we need
+        * to notify fmd so that it can gracefully repair any outstanding
+        * cases due to a missing device.  We do this in all cases, even those
+        * that probably don't correlate to a repaired fault.  This is sure to
+        * catch all cases, and we let the zfs-retire agent sort it out.  If
+        * this is a transient state it's OK, as the retire agent will
+        * double-check the state of the vdev before repairing it.
+        */
+       if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf &&
+           vd->vdev_prevstate != state)
+               zfs_post_state_change(spa, vd);
+
        if (vd->vdev_removed &&
            state == VDEV_STATE_CANT_OPEN &&
            (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
@@ -2610,11 +2941,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
                vd->vdev_state = VDEV_STATE_REMOVED;
                vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
        } else if (state == VDEV_STATE_REMOVED) {
-               /*
-                * Indicate to the ZFS DE that this device has been removed, and
-                * any recent errors should be ignored.
-                */
-               zfs_post_remove(spa, vd);
                vd->vdev_removed = B_TRUE;
        } else if (state == VDEV_STATE_CANT_OPEN) {
                /*
@@ -2623,7 +2949,7 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
                 * begin with.  Failure to open such a device is not considered
                 * an error.
                 */
-               if (spa->spa_load_state == SPA_LOAD_IMPORT &&
+               if (spa_load_state(spa) == SPA_LOAD_IMPORT &&
                    vd->vdev_ops->vdev_op_leaf)
                        vd->vdev_not_present = 1;
 
@@ -2666,9 +2992,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
                        case VDEV_AUX_BAD_LABEL:
                                class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
                                break;
-                       case VDEV_AUX_IO_FAILURE:
-                               class = FM_EREPORT_ZFS_IO_FAILURE;
-                               break;
                        default:
                                class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
                        }
@@ -2716,32 +3039,31 @@ vdev_is_bootable(vdev_t *vd)
        return (B_TRUE);
 }
 
+/*
+ * Load the state from the original vdev tree (ovd) which
+ * we've retrieved from the MOS config object. If the original
+ * vdev was offline then we transfer that state to the device
+ * in the current vdev tree (nvd).
+ */
 void
-vdev_load_log_state(vdev_t *vd, nvlist_t *nv)
+vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
 {
-       uint_t children;
-       nvlist_t **child;
-       uint64_t val;
-       spa_t *spa = vd->vdev_spa;
+       spa_t *spa = nvd->vdev_spa;
 
-       if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
-           &child, &children) == 0) {
-               for (int c = 0; c < children; c++)
-                       vdev_load_log_state(vd->vdev_child[c], child[c]);
-       }
+       ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+       ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
 
-       if (vd->vdev_ops->vdev_op_leaf && nvlist_lookup_uint64(nv,
-           ZPOOL_CONFIG_OFFLINE, &val) == 0 && val) {
+       for (int c = 0; c < nvd->vdev_children; c++)
+               vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
 
+       if (nvd->vdev_ops->vdev_op_leaf && ovd->vdev_offline) {
                /*
                 * It would be nice to call vdev_offline()
                 * directly but the pool isn't fully loaded and
                 * the txg threads have not been started yet.
                 */
-               spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_WRITER);
-               vd->vdev_offline = val;
-               vdev_reopen(vd->vdev_top);
-               spa_config_exit(spa, SCL_STATE_ALL, FTAG);
+               nvd->vdev_offline = ovd->vdev_offline;
+               vdev_reopen(nvd->vdev_top);
        }
 }
 
@@ -2759,3 +3081,22 @@ vdev_expand(vdev_t *vd, uint64_t txg)
                vdev_config_dirty(vd);
        }
 }
+
+/*
+ * Split a vdev.
+ */
+void
+vdev_split(vdev_t *vd)
+{
+       vdev_t *cvd, *pvd = vd->vdev_parent;
+
+       vdev_remove_child(pvd, vd);
+       vdev_compact_children(pvd);
+
+       cvd = pvd->vdev_child[0];
+       if (pvd->vdev_children == 1) {
+               vdev_remove_parent(cvd);
+               cvd->vdev_splitting = B_TRUE;
+       }
+       vdev_propagate_state(cvd);
+}
index 9b3a9f5..688d541 100644 (file)
@@ -172,7 +172,7 @@ vdev_cache_allocate(zio_t *zio)
 
        ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
        ve->ve_offset = offset;
-       ve->ve_lastused = lbolt;
+       ve->ve_lastused = ddi_get_lbolt();
        ve->ve_data = zio_buf_alloc(VCBS);
 
        avl_add(&vc->vc_offset_tree, ve);
@@ -189,9 +189,9 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
        ASSERT(MUTEX_HELD(&vc->vc_lock));
        ASSERT(ve->ve_fill_io == NULL);
 
-       if (ve->ve_lastused != lbolt) {
+       if (ve->ve_lastused != ddi_get_lbolt()) {
                avl_remove(&vc->vc_lastused_tree, ve);
-               ve->ve_lastused = lbolt;
+               ve->ve_lastused = ddi_get_lbolt();
                avl_add(&vc->vc_lastused_tree, ve);
        }
 
index f91dddb..8c22aa5 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
  * Virtual device vector for files.
  */
 
+static void
+vdev_file_hold(vdev_t *vd)
+{
+       ASSERT(vd->vdev_path != NULL);
+}
+
+static void
+vdev_file_rele(vdev_t *vd)
+{
+       ASSERT(vd->vdev_path != NULL);
+}
+
 static int
 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
 {
@@ -51,6 +62,16 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
                return (EINVAL);
        }
 
+       /*
+        * Reopen the device if it's not currently open.  Otherwise,
+        * just update the physical size of the device.
+        */
+       if (vd->vdev_tsd != NULL) {
+               ASSERT(vd->vdev_reopening);
+               vf = vd->vdev_tsd;
+               goto skip_open;
+       }
+
        vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
 
        /*
@@ -79,6 +100,8 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
                return (ENODEV);
        }
 #endif
+
+skip_open:
        /*
         * Determine the physical size of the file.
         */
@@ -100,7 +123,7 @@ vdev_file_close(vdev_t *vd)
 {
        vdev_file_t *vf = vd->vdev_tsd;
 
-       if (vf == NULL)
+       if (vd->vdev_reopening || vf == NULL)
                return;
 
        if (vf->vf_vnode != NULL) {
@@ -110,6 +133,7 @@ vdev_file_close(vdev_t *vd)
                VN_RELE(vf->vf_vnode);
        }
 
+       vd->vdev_delayed_close = B_FALSE;
        kmem_free(vf, sizeof (vdev_file_t));
        vd->vdev_tsd = NULL;
 }
@@ -166,6 +190,8 @@ vdev_ops_t vdev_file_ops = {
        vdev_file_io_start,
        vdev_file_io_done,
        NULL,
+       vdev_file_hold,
+       vdev_file_rele,
        VDEV_TYPE_FILE,         /* name of this vdev type */
        B_TRUE                  /* leaf vdev */
 };
@@ -182,6 +208,8 @@ vdev_ops_t vdev_disk_ops = {
        vdev_file_io_start,
        vdev_file_io_done,
        NULL,
+       vdev_file_hold,
+       vdev_file_rele,
        VDEV_TYPE_DISK,         /* name of this vdev type */
        B_TRUE                  /* leaf vdev */
 };
index 06cb720..75ec545 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
 #include <sys/uberblock_impl.h>
 #include <sys/metaslab.h>
 #include <sys/zio.h>
+#include <sys/dsl_scan.h>
 #include <sys/fs/zfs.h>
 
 /*
@@ -208,7 +208,7 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
  */
 nvlist_t *
 vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
-    boolean_t isspare, boolean_t isl2cache)
+    vdev_config_flag_t flags)
 {
        nvlist_t *nv = NULL;
 
@@ -216,7 +216,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
 
        VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
            vd->vdev_ops->vdev_op_type) == 0);
-       if (!isspare && !isl2cache)
+       if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
                VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id)
                    == 0);
        VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0);
@@ -270,7 +270,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
        if (vd->vdev_isspare)
                VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0);
 
-       if (!isspare && !isl2cache && vd == vd->vdev_top) {
+       if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
+           vd == vd->vdev_top) {
                VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
                    vd->vdev_ms_array) == 0);
                VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
@@ -281,39 +282,74 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
                    vd->vdev_asize) == 0);
                VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG,
                    vd->vdev_islog) == 0);
+               if (vd->vdev_removing)
+                       VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
+                           vd->vdev_removing) == 0);
        }
 
        if (vd->vdev_dtl_smo.smo_object != 0)
                VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
                    vd->vdev_dtl_smo.smo_object) == 0);
 
+       if (vd->vdev_crtxg)
+               VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
+                   vd->vdev_crtxg) == 0);
+
        if (getstats) {
                vdev_stat_t vs;
+               pool_scan_stat_t ps;
+
                vdev_get_stats(vd, &vs);
-               VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS,
+               VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
                    (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0);
+
+               /* provide either current or previous scan information */
+               if (spa_scan_get_stats(spa, &ps) == 0) {
+                       VERIFY(nvlist_add_uint64_array(nv,
+                           ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
+                           sizeof (pool_scan_stat_t) / sizeof (uint64_t))
+                           == 0);
+               }
        }
 
        if (!vd->vdev_ops->vdev_op_leaf) {
                nvlist_t **child;
-               int c;
+               int c, idx;
+
+               ASSERT(!vd->vdev_ishole);
 
                child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
                    KM_SLEEP);
 
-               for (c = 0; c < vd->vdev_children; c++)
-                       child[c] = vdev_config_generate(spa, vd->vdev_child[c],
-                           getstats, isspare, isl2cache);
+               for (c = 0, idx = 0; c < vd->vdev_children; c++) {
+                       vdev_t *cvd = vd->vdev_child[c];
 
-               VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
-                   child, vd->vdev_children) == 0);
+                       /*
+                        * If we're generating an nvlist of removing
+                        * vdevs then skip over any device which is
+                        * not being removed.
+                        */
+                       if ((flags & VDEV_CONFIG_REMOVING) &&
+                           !cvd->vdev_removing)
+                               continue;
 
-               for (c = 0; c < vd->vdev_children; c++)
+                       child[idx++] = vdev_config_generate(spa, cvd,
+                           getstats, flags);
+               }
+
+               if (idx) {
+                       VERIFY(nvlist_add_nvlist_array(nv,
+                           ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
+               }
+
+               for (c = 0; c < idx; c++)
                        nvlist_free(child[c]);
 
                kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
 
        } else {
+               const char *aux = NULL;
+
                if (vd->vdev_offline && !vd->vdev_tmpoffline)
                        VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE,
                            B_TRUE) == 0);
@@ -329,11 +365,66 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
                if (vd->vdev_unspare)
                        VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE,
                            B_TRUE) == 0);
+               if (vd->vdev_ishole)
+                       VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE,
+                           B_TRUE) == 0);
+
+               switch (vd->vdev_stat.vs_aux) {
+               case VDEV_AUX_ERR_EXCEEDED:
+                       aux = "err_exceeded";
+                       break;
+
+               case VDEV_AUX_EXTERNAL:
+                       aux = "external";
+                       break;
+               }
+
+               if (aux != NULL)
+                       VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE,
+                           aux) == 0);
+
+               if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) {
+                       VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
+                           vd->vdev_orig_guid) == 0);
+               }
        }
 
        return (nv);
 }
 
+/*
+ * Generate a view of the top-level vdevs.  If we currently have holes
+ * in the namespace, then generate an array which contains a list of holey
+ * vdevs.  Additionally, add the number of top-level children that currently
+ * exist.
+ */
+void
+vdev_top_config_generate(spa_t *spa, nvlist_t *config)
+{
+       vdev_t *rvd = spa->spa_root_vdev;
+       uint64_t *array;
+       uint_t c, idx;
+
+       array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP);
+
+       for (c = 0, idx = 0; c < rvd->vdev_children; c++) {
+               vdev_t *tvd = rvd->vdev_child[c];
+
+               if (tvd->vdev_ishole)
+                       array[idx++] = c;
+       }
+
+       if (idx) {
+               VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
+                   array, idx) == 0);
+       }
+
+       VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
+           rvd->vdev_children) == 0);
+
+       kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
+}
+
 nvlist_t *
 vdev_label_read_config(vdev_t *vd)
 {
@@ -516,6 +607,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
                    crtxg, reason)) != 0)
                        return (error);
 
+       /* Track the creation time for this vdev */
+       vd->vdev_crtxg = crtxg;
+
        if (!vd->vdev_ops->vdev_op_leaf)
                return (0);
 
@@ -528,7 +622,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
        /*
         * Determine if the vdev is in use.
         */
-       if (reason != VDEV_LABEL_REMOVE &&
+       if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT &&
            vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid))
                return (EBUSY);
 
@@ -554,7 +648,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
                 */
                if (reason == VDEV_LABEL_SPARE)
                        return (0);
-               ASSERT(reason == VDEV_LABEL_REPLACE);
+               ASSERT(reason == VDEV_LABEL_REPLACE ||
+                   reason == VDEV_LABEL_SPLIT);
        }
 
        if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE &&
@@ -619,7 +714,11 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
                VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
                    vd->vdev_guid) == 0);
        } else {
-               label = spa_config_generate(spa, vd, 0ULL, B_FALSE);
+               uint64_t txg = 0ULL;
+
+               if (reason == VDEV_LABEL_SPLIT)
+                       txg = spa->spa_uberblock.ub_txg;
+               label = spa_config_generate(spa, vd, txg, B_FALSE);
 
                /*
                 * Add our creation time.  This allows us to detect multiple
@@ -717,11 +816,6 @@ retry:
  */
 
 /*
- * For use by zdb and debugging purposes only
- */
-uint64_t ub_max_txg = UINT64_MAX;
-
-/*
  * Consider the following situation: txg is safely synced to disk.  We've
  * written the first uberblock for txg + 1, and then we lose power.  When we
  * come back up, we fail to see the uberblock for txg + 1 because, say,
@@ -750,6 +844,7 @@ vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
 static void
 vdev_uberblock_load_done(zio_t *zio)
 {
+       spa_t *spa = zio->io_spa;
        zio_t *rio = zio->io_private;
        uberblock_t *ub = zio->io_data;
        uberblock_t *ubbest = rio->io_private;
@@ -758,7 +853,7 @@ vdev_uberblock_load_done(zio_t *zio)
 
        if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
                mutex_enter(&rio->io_lock);
-               if (ub->ub_txg <= ub_max_txg &&
+               if (ub->ub_txg <= spa->spa_load_max_txg &&
                    vdev_uberblock_compare(ub, ubbest) > 0)
                        *ubbest = *ub;
                mutex_exit(&rio->io_lock);
@@ -976,6 +1071,9 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
        for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) {
                uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t),
                    KM_SLEEP);
+
+               ASSERT(!vd->vdev_ishole);
+
                zio_t *vio = zio_null(zio, spa, NULL,
                    (vd->vdev_islog || vd->vdev_aux != NULL) ?
                    vdev_label_sync_ignore_done : vdev_label_sync_top_done,
index 836386d..698c027 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -60,6 +60,11 @@ vdev_mirror_map_free(zio_t *zio)
        kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
 }
 
+static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
+       vdev_mirror_map_free,
+       zio_vsd_default_cksum_report
+};
+
 static mirror_map_t *
 vdev_mirror_map_alloc(zio_t *zio)
 {
@@ -117,7 +122,7 @@ vdev_mirror_map_alloc(zio_t *zio)
        }
 
        zio->io_vsd = mm;
-       zio->io_vsd_free = vdev_mirror_map_free;
+       zio->io_vsd_ops = &vdev_mirror_vsd_ops;
        return (mm);
 }
 
@@ -209,7 +214,7 @@ vdev_mirror_child_select(zio_t *zio)
        uint64_t txg = zio->io_txg;
        int i, c;
 
-       ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg);
+       ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
 
        /*
         * Try to find a child whose DTL doesn't contain the block to read.
@@ -447,6 +452,8 @@ vdev_ops_t vdev_mirror_ops = {
        vdev_mirror_io_start,
        vdev_mirror_io_done,
        vdev_mirror_state_change,
+       NULL,
+       NULL,
        VDEV_TYPE_MIRROR,       /* name of this vdev type */
        B_FALSE                 /* not a leaf vdev */
 };
@@ -458,6 +465,8 @@ vdev_ops_t vdev_replacing_ops = {
        vdev_mirror_io_start,
        vdev_mirror_io_done,
        vdev_mirror_state_change,
+       NULL,
+       NULL,
        VDEV_TYPE_REPLACING,    /* name of this vdev type */
        B_FALSE                 /* not a leaf vdev */
 };
@@ -469,6 +478,8 @@ vdev_ops_t vdev_spare_ops = {
        vdev_mirror_io_start,
        vdev_mirror_io_done,
        vdev_mirror_state_change,
+       NULL,
+       NULL,
        VDEV_TYPE_SPARE,        /* name of this vdev type */
        B_FALSE                 /* not a leaf vdev */
 };
index 731f7d3..6a5588d 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -48,8 +48,8 @@ vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
         * VDEV_AUX_BAD_GUID_SUM.  So we pretend to succeed, knowing that we
         * will fail the GUID sum check before ever trying to open the pool.
         */
-       *psize = SPA_MINDEVSIZE;
-       *ashift = SPA_MINBLOCKSHIFT;
+       *psize = 0;
+       *ashift = 0;
        return (0);
 }
 
@@ -80,6 +80,21 @@ vdev_ops_t vdev_missing_ops = {
        vdev_missing_io_start,
        vdev_missing_io_done,
        NULL,
+       NULL,
+       NULL,
        VDEV_TYPE_MISSING,      /* name of this vdev type */
        B_TRUE                  /* leaf vdev */
 };
+
+vdev_ops_t vdev_hole_ops = {
+       vdev_missing_open,
+       vdev_missing_close,
+       vdev_default_asize,
+       vdev_missing_io_start,
+       vdev_missing_io_done,
+       NULL,
+       NULL,
+       NULL,
+       VDEV_TYPE_HOLE,         /* name of this vdev type */
+       B_TRUE                  /* leaf vdev */
+};
index 9867d09..5a0d3ee 100644 (file)
@@ -24,7 +24,6 @@
  */
 
 #include <sys/zfs_context.h>
-#include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 #include <sys/avl.h>
  * of i/os pending to each device (before it starts ramping up to
  * max_pending).
  */
-int zfs_vdev_max_pending = 35;
+int zfs_vdev_max_pending = 10;
 int zfs_vdev_min_pending = 4;
 
-/* deadline = pri + (lbolt >> time_shift) */
+/* deadline = pri + (ddi_get_lbolt64() >> time_shift) */
 int zfs_vdev_time_shift = 6;
 
 /* exponential I/O issue ramp-up rate */
@@ -286,7 +285,7 @@ again:
                ASSERT(size <= zfs_vdev_aggregation_limit);
 
                aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
-                   zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_NOW,
+                   zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG,
                    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
                    vdev_queue_agg_io_done, NULL);
 
@@ -360,7 +359,8 @@ vdev_queue_io(zio_t *zio)
 
        mutex_enter(&vq->vq_lock);
 
-       zio->io_deadline = (lbolt64 >> zfs_vdev_time_shift) + zio->io_priority;
+       zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) +
+           zio->io_priority;
 
        vdev_queue_io_add(vq, zio);
 
index b307417..4b0f560 100644 (file)
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -103,6 +102,7 @@ typedef struct raidz_col {
        uint64_t rc_offset;             /* device offset */
        uint64_t rc_size;               /* I/O size */
        void *rc_data;                  /* I/O data */
+       void *rc_gdata;                 /* used to store the "good" version */
        int rc_error;                   /* I/O error for this device */
        uint8_t rc_tried;               /* Did we attempt this I/O column? */
        uint8_t rc_skipped;             /* Did we skip this I/O column? */
@@ -116,14 +116,18 @@ typedef struct raidz_map {
        uint64_t rm_missingdata;        /* Count of missing data devices */
        uint64_t rm_missingparity;      /* Count of missing parity devices */
        uint64_t rm_firstdatacol;       /* First data column/parity count */
-       uint64_t rm_skipped;            /* Skipped sectors for padding */
+       uint64_t rm_nskip;              /* Skipped sectors for padding */
+       uint64_t rm_skipstart;  /* Column index of padding start */
+       void *rm_datacopy;              /* rm_asize-buffer of copied data */
+       uintptr_t rm_reports;           /* # of referencing checksum reports */
+       uint8_t rm_freed;               /* map no longer has referencing ZIO */
+       uint8_t rm_ecksuminjected;      /* checksum error was injected */
        raidz_col_t rm_col[1];          /* Flexible array of I/O columns */
 } raidz_map_t;
 
 #define        VDEV_RAIDZ_P            0
 #define        VDEV_RAIDZ_Q            1
 #define        VDEV_RAIDZ_R            2
-#define        VDEV_RAIDZ_MAXPARITY    3
 
 #define        VDEV_RAIDZ_MUL_2(x)     (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
 #define        VDEV_RAIDZ_MUL_4(x)     (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
@@ -226,6 +230,8 @@ static const uint8_t vdev_raidz_log2[256] = {
        0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
 };
 
+static void vdev_raidz_generate_parity(raidz_map_t *rm);
+
 /*
  * Multiply a given number by 2 raised to the given power.
  */
@@ -246,17 +252,184 @@ vdev_raidz_exp2(uint_t a, int exp)
 }
 
 static void
-vdev_raidz_map_free(zio_t *zio)
+vdev_raidz_map_free(raidz_map_t *rm)
 {
-       raidz_map_t *rm = zio->io_vsd;
        int c;
+       size_t size;
 
-       for (c = 0; c < rm->rm_firstdatacol; c++)
+       for (c = 0; c < rm->rm_firstdatacol; c++) {
                zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
 
+               if (rm->rm_col[c].rc_gdata != NULL)
+                       zio_buf_free(rm->rm_col[c].rc_gdata,
+                           rm->rm_col[c].rc_size);
+       }
+
+       size = 0;
+       for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
+               size += rm->rm_col[c].rc_size;
+
+       if (rm->rm_datacopy != NULL)
+               zio_buf_free(rm->rm_datacopy, size);
+
        kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
 }
 
+static void
+vdev_raidz_map_free_vsd(zio_t *zio)
+{
+       raidz_map_t *rm = zio->io_vsd;
+
+       ASSERT3U(rm->rm_freed, ==, 0);
+       rm->rm_freed = 1;
+
+       if (rm->rm_reports == 0)
+               vdev_raidz_map_free(rm);
+}
+
+/*ARGSUSED*/
+static void
+vdev_raidz_cksum_free(void *arg, size_t ignored)
+{
+       raidz_map_t *rm = arg;
+
+       ASSERT3U(rm->rm_reports, >, 0);
+
+       if (--rm->rm_reports == 0 && rm->rm_freed != 0)
+               vdev_raidz_map_free(rm);
+}
+
+static void
+vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
+{
+       raidz_map_t *rm = zcr->zcr_cbdata;
+       size_t c = zcr->zcr_cbinfo;
+       size_t x;
+
+       const char *good = NULL;
+       const char *bad = rm->rm_col[c].rc_data;
+
+       if (good_data == NULL) {
+               zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
+               return;
+       }
+
+       if (c < rm->rm_firstdatacol) {
+               /*
+                * The first time through, calculate the parity blocks for
+                * the good data (this relies on the fact that the good
+                * data never changes for a given logical ZIO)
+                */
+               if (rm->rm_col[0].rc_gdata == NULL) {
+                       char *bad_parity[VDEV_RAIDZ_MAXPARITY];
+                       char *buf;
+
+                       /*
+                        * Set up the rm_col[]s to generate the parity for
+                        * good_data, first saving the parity bufs and
+                        * replacing them with buffers to hold the result.
+                        */
+                       for (x = 0; x < rm->rm_firstdatacol; x++) {
+                               bad_parity[x] = rm->rm_col[x].rc_data;
+                               rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
+                                   zio_buf_alloc(rm->rm_col[x].rc_size);
+                       }
+
+                       /* fill in the data columns from good_data */
+                       buf = (char *)good_data;
+                       for (; x < rm->rm_cols; x++) {
+                               rm->rm_col[x].rc_data = buf;
+                               buf += rm->rm_col[x].rc_size;
+                       }
+
+                       /*
+                        * Construct the parity from the good data.
+                        */
+                       vdev_raidz_generate_parity(rm);
+
+                       /* restore everything back to its original state */
+                       for (x = 0; x < rm->rm_firstdatacol; x++)
+                               rm->rm_col[x].rc_data = bad_parity[x];
+
+                       buf = rm->rm_datacopy;
+                       for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
+                               rm->rm_col[x].rc_data = buf;
+                               buf += rm->rm_col[x].rc_size;
+                       }
+               }
+
+               ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
+               good = rm->rm_col[c].rc_gdata;
+       } else {
+               /* adjust good_data to point at the start of our column */
+               good = good_data;
+
+               for (x = rm->rm_firstdatacol; x < c; x++)
+                       good += rm->rm_col[x].rc_size;
+       }
+
+       /* we drop the ereport if it ends up that the data was good */
+       zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
+}
+
+/*
+ * Invoked indirectly by zfs_ereport_start_checksum(), called
+ * below when our read operation fails completely.  The main point
+ * is to keep a copy of everything we read from disk, so that at
+ * vdev_raidz_cksum_finish() time we can compare it with the good data.
+ */
+static void
+vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
+{
+       size_t c = (size_t)(uintptr_t)arg;
+       caddr_t buf;
+
+       raidz_map_t *rm = zio->io_vsd;
+       size_t size;
+
+       /* set up the report and bump the refcount  */
+       zcr->zcr_cbdata = rm;
+       zcr->zcr_cbinfo = c;
+       zcr->zcr_finish = vdev_raidz_cksum_finish;
+       zcr->zcr_free = vdev_raidz_cksum_free;
+
+       rm->rm_reports++;
+       ASSERT3U(rm->rm_reports, >, 0);
+
+       if (rm->rm_datacopy != NULL)
+               return;
+
+       /*
+        * It's the first time we're called for this raidz_map_t, so we need
+        * to copy the data aside; there's no guarantee that our zio's buffer
+        * won't be re-used for something else.
+        *
+        * Our parity data is already in separate buffers, so there's no need
+        * to copy them.
+        */
+
+       size = 0;
+       for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
+               size += rm->rm_col[c].rc_size;
+
+       buf = rm->rm_datacopy = zio_buf_alloc(size);
+
+       for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+               raidz_col_t *col = &rm->rm_col[c];
+
+               bcopy(col->rc_data, buf, col->rc_size);
+               col->rc_data = buf;
+
+               buf += col->rc_size;
+       }
+       ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
+}
+
+static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
+       vdev_raidz_map_free_vsd,
+       vdev_raidz_cksum_report
+};
+
 static raidz_map_t *
 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
     uint64_t nparity)
@@ -288,9 +461,14 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
        rm->rm_cols = acols;
        rm->rm_scols = scols;
        rm->rm_bigcols = bc;
+       rm->rm_skipstart = bc;
        rm->rm_missingdata = 0;
        rm->rm_missingparity = 0;
        rm->rm_firstdatacol = nparity;
+       rm->rm_datacopy = NULL;
+       rm->rm_reports = 0;
+       rm->rm_freed = 0;
+       rm->rm_ecksuminjected = 0;
 
        asize = 0;
 
@@ -304,6 +482,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
                rm->rm_col[c].rc_devidx = col;
                rm->rm_col[c].rc_offset = coff;
                rm->rm_col[c].rc_data = NULL;
+               rm->rm_col[c].rc_gdata = NULL;
                rm->rm_col[c].rc_error = 0;
                rm->rm_col[c].rc_tried = 0;
                rm->rm_col[c].rc_skipped = 0;
@@ -320,9 +499,9 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
 
        ASSERT3U(asize, ==, tot << unit_shift);
        rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
-       rm->rm_skipped = roundup(tot, nparity + 1) - tot;
-       ASSERT3U(rm->rm_asize - asize, ==, rm->rm_skipped << unit_shift);
-       ASSERT3U(rm->rm_skipped, <=, nparity);
+       rm->rm_nskip = roundup(tot, nparity + 1) - tot;
+       ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
+       ASSERT3U(rm->rm_nskip, <=, nparity);
 
        for (c = 0; c < rm->rm_firstdatacol; c++)
                rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
@@ -347,6 +526,11 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
         * Unfortunately, this decision created an implicit on-disk format
         * requirement that we need to support for all eternity, but only
         * for single-parity RAID-Z.
+        *
+        * If we intend to skip a sector in the zeroth column for padding
+        * we must make sure to note this swap. We will never intend to
+        * skip the first column since at least one data and one parity
+        * column must appear in each row.
         */
        ASSERT(rm->rm_cols >= 2);
        ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
@@ -358,10 +542,13 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
                rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
                rm->rm_col[1].rc_devidx = devidx;
                rm->rm_col[1].rc_offset = o;
+
+               if (rm->rm_skipstart == 0)
+                       rm->rm_skipstart = 1;
        }
 
        zio->io_vsd = rm;
-       zio->io_vsd_free = vdev_raidz_map_free;
+       zio->io_vsd_ops = &vdev_raidz_vsd_ops;
        return (rm);
 }
 
@@ -1335,7 +1522,6 @@ vdev_raidz_io_start(zio_t *zio)
        vdev_t *vd = zio->io_vd;
        vdev_t *tvd = vd->vdev_top;
        vdev_t *cvd;
-       blkptr_t *bp = zio->io_bp;
        raidz_map_t *rm;
        raidz_col_t *rc;
        int c, i;
@@ -1361,7 +1547,7 @@ vdev_raidz_io_start(zio_t *zio)
                 * Generate optional I/Os for any skipped sectors to improve
                 * aggregation contiguity.
                 */
-               for (c = rm->rm_bigcols, i = 0; i < rm->rm_skipped; c++, i++) {
+               for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
                        ASSERT(c <= rm->rm_scols);
                        if (c == rm->rm_scols)
                                c = 0;
@@ -1396,7 +1582,7 @@ vdev_raidz_io_start(zio_t *zio)
                        rc->rc_skipped = 1;
                        continue;
                }
-               if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
+               if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
                        if (c >= rm->rm_firstdatacol)
                                rm->rm_missingdata++;
                        else
@@ -1417,23 +1603,47 @@ vdev_raidz_io_start(zio_t *zio)
        return (ZIO_PIPELINE_CONTINUE);
 }
 
+
 /*
  * Report a checksum error for a child of a RAID-Z device.
  */
 static void
-raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
+raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
 {
        vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
 
        if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+               zio_bad_cksum_t zbc;
+               raidz_map_t *rm = zio->io_vsd;
+
                mutex_enter(&vd->vdev_stat_lock);
                vd->vdev_stat.vs_checksum_errors++;
                mutex_exit(&vd->vdev_stat_lock);
+
+               zbc.zbc_has_cksum = 0;
+               zbc.zbc_injected = rm->rm_ecksuminjected;
+
+               zfs_ereport_post_checksum(zio->io_spa, vd, zio,
+                   rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
+                   &zbc);
        }
+}
+
+/*
+ * We keep track of whether or not there were any injected errors, so that
+ * any ereports we generate can note it.
+ */
+static int
+raidz_checksum_verify(zio_t *zio)
+{
+       zio_bad_cksum_t zbc;
+       raidz_map_t *rm = zio->io_vsd;
+
+       int ret = zio_checksum_error(zio, &zbc);
+       if (ret != 0 && zbc.zbc_injected != 0)
+               rm->rm_ecksuminjected = 1;
 
-       if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
-               zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
-                   zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
+       return (ret);
 }
 
 /*
@@ -1464,7 +1674,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
                if (!rc->rc_tried || rc->rc_error != 0)
                        continue;
                if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
-                       raidz_checksum_error(zio, rc);
+                       raidz_checksum_error(zio, rc, orig[c]);
                        rc->rc_error = ECKSUM;
                        ret++;
                }
@@ -1579,7 +1789,7 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
                         * success.
                         */
                        code = vdev_raidz_reconstruct(rm, tgts, n);
-                       if (zio_checksum_error(zio) == 0) {
+                       if (raidz_checksum_verify(zio) == 0) {
                                atomic_inc_64(&raidz_corrected[code]);
 
                                for (i = 0; i < n; i++) {
@@ -1587,7 +1797,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
                                        rc = &rm->rm_col[c];
                                        ASSERT(rc->rc_error == 0);
                                        if (rc->rc_tried)
-                                               raidz_checksum_error(zio, rc);
+                                               raidz_checksum_error(zio, rc,
+                                                   orig[i]);
                                        rc->rc_error = ECKSUM;
                                }
 
@@ -1724,7 +1935,7 @@ vdev_raidz_io_done(zio_t *zio)
         */
        if (total_errors <= rm->rm_firstdatacol - parity_untried) {
                if (data_errors == 0) {
-                       if (zio_checksum_error(zio) == 0) {
+                       if (raidz_checksum_verify(zio) == 0) {
                                /*
                                 * If we read parity information (unnecessarily
                                 * as it happens since no reconstruction was
@@ -1770,7 +1981,7 @@ vdev_raidz_io_done(zio_t *zio)
 
                        code = vdev_raidz_reconstruct(rm, tgts, n);
 
-                       if (zio_checksum_error(zio) == 0) {
+                       if (raidz_checksum_verify(zio) == 0) {
                                atomic_inc_64(&raidz_corrected[code]);
 
                                /*
@@ -1839,18 +2050,11 @@ vdev_raidz_io_done(zio_t *zio)
         * reconstruction over all possible combinations. If that fails,
         * we're cooked.
         */
-       if (total_errors >= rm->rm_firstdatacol) {
+       if (total_errors > rm->rm_firstdatacol) {
                zio->io_error = vdev_raidz_worst_error(rm);
-               /*
-                * If there were exactly as many device errors as parity
-                * columns, yet we couldn't reconstruct the data, then at
-                * least one device must have returned bad data silently.
-                */
-               if (total_errors == rm->rm_firstdatacol)
-                       zio->io_error = zio_worst_error(zio->io_error, ECKSUM);
 
-       } else if ((code = vdev_raidz_combrec(zio, total_errors,
-           data_errors)) != 0) {
+       } else if (total_errors < rm->rm_firstdatacol &&
+           (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
                /*
                 * If we didn't use all the available parity for the
                 * combinatorial reconstruction, verify that the remaining
@@ -1860,17 +2064,34 @@ vdev_raidz_io_done(zio_t *zio)
                        (void) raidz_parity_verify(zio, rm);
        } else {
                /*
-                * All combinations failed to checksum. Generate checksum
-                * ereports for all children.
+                * We're here because either:
+                *
+                *      total_errors == rm_firstdatacol, or
+                *      vdev_raidz_combrec() failed
+                *
+                * In either case, there is enough bad data to prevent
+                * reconstruction.
+                *
+                * Start checksum ereports for all children which haven't
+                * failed, and the IO wasn't speculative.
                 */
                zio->io_error = ECKSUM;
 
                if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
                        for (c = 0; c < rm->rm_cols; c++) {
                                rc = &rm->rm_col[c];
-                               zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
-                                   zio->io_spa, vd->vdev_child[rc->rc_devidx],
-                                   zio, rc->rc_offset, rc->rc_size);
+                               if (rc->rc_error == 0) {
+                                       zio_bad_cksum_t zbc;
+                                       zbc.zbc_has_cksum = 0;
+                                       zbc.zbc_injected =
+                                           rm->rm_ecksuminjected;
+
+                                       zfs_ereport_start_checksum(
+                                           zio->io_spa,
+                                           vd->vdev_child[rc->rc_devidx],
+                                           zio, rc->rc_offset, rc->rc_size,
+                                           (void *)(uintptr_t)c, &zbc);
+                               }
                        }
                }
        }
@@ -1918,6 +2139,8 @@ vdev_ops_t vdev_raidz_ops = {
        vdev_raidz_io_start,
        vdev_raidz_io_done,
        vdev_raidz_state_change,
+       NULL,
+       NULL,
        VDEV_TYPE_RAIDZ,        /* name of this vdev type */
        B_FALSE                 /* not a leaf vdev */
 };
index 524c8e6..879f78f 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -109,6 +109,8 @@ vdev_ops_t vdev_root_ops = {
        NULL,                   /* io_start - not applicable to the root */
        NULL,                   /* io_done - not applicable to the root */
        vdev_root_state_change,
+       NULL,
+       NULL,
        VDEV_TYPE_ROOT,         /* name of this vdev type */
        B_FALSE                 /* not a leaf vdev */
 };
index 2dc2705..288a4d9 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
@@ -70,7 +69,7 @@ fzap_byteswap(void *vbuf, size_t size)
 }
 
 void
-fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
+fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
 {
        dmu_buf_t *db;
        zap_leaf_t *l;
@@ -102,6 +101,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
        zp->zap_num_entries = 0;
        zp->zap_salt = zap->zap_salt;
        zp->zap_normflags = zap->zap_normflags;
+       zp->zap_flags = flags;
 
        /* block 1 will be the first leaf */
        for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
@@ -111,7 +111,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
         * set up block 1 - the first leaf
         */
        VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
-           1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db));
+           1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
        dmu_buf_will_dirty(db, tx);
 
        l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
@@ -172,20 +172,20 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
 
        b = tbl->zt_blks_copied;
        err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
-           (tbl->zt_blk + b) << bs, FTAG, &db_old);
+           (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
        if (err)
                return (err);
 
        /* first half of entries in old[b] go to new[2*b+0] */
        VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
-           (newblk + 2*b+0) << bs, FTAG, &db_new));
+           (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
        dmu_buf_will_dirty(db_new, tx);
        transfer_func(db_old->db_data, db_new->db_data, hepb);
        dmu_buf_rele(db_new, FTAG);
 
        /* second half of entries in old[b] go to new[2*b+1] */
        VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
-           (newblk + 2*b+1) << bs, FTAG, &db_new));
+           (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
        dmu_buf_will_dirty(db_new, tx);
        transfer_func((uint64_t *)db_old->db_data + hepb,
            db_new->db_data, hepb);
@@ -233,7 +233,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
        off = idx & ((1<<(bs-3))-1);
 
        err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
-           (tbl->zt_blk + blk) << bs, FTAG, &db);
+           (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
        if (err)
                return (err);
        dmu_buf_will_dirty(db, tx);
@@ -245,7 +245,8 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
                dmu_buf_t *db2;
 
                err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
-                   (tbl->zt_nextblk + blk2) << bs, FTAG, &db2);
+                   (tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
+                   DMU_READ_NO_PREFETCH);
                if (err) {
                        dmu_buf_rele(db, FTAG);
                        return (err);
@@ -276,7 +277,7 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
        off = idx & ((1<<(bs-3))-1);
 
        err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
-           (tbl->zt_blk + blk) << bs, FTAG, &db);
+           (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
        if (err)
                return (err);
        *valp = ((uint64_t *)db->db_data)[off];
@@ -291,7 +292,8 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
                blk = (idx*2) >> (bs-3);
 
                err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
-                   (tbl->zt_nextblk + blk) << bs, FTAG, &db);
+                   (tbl->zt_nextblk + blk) << bs, FTAG, &db,
+                   DMU_READ_NO_PREFETCH);
                dmu_buf_rele(db, FTAG);
        }
        return (err);
@@ -315,8 +317,13 @@ zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
 static int
 zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
 {
-       /* In case things go horribly wrong. */
-       if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= ZAP_HASHBITS-2)
+       /*
+        * The pointer table should never use more hash bits than we
+        * have (otherwise we'd be using useless zero bits to index it).
+        * If we are within 2 bits of running out, stop growing, since
+        * this is already an aberrant condition.
+        */
+       if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
                return (ENOSPC);
 
        if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
@@ -335,7 +342,8 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
 
                newblk = zap_allocate_blocks(zap, 1);
                err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
-                   newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new);
+                   newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
+                   DMU_READ_NO_PREFETCH);
                if (err)
                        return (err);
                dmu_buf_will_dirty(db_new, tx);
@@ -393,7 +401,8 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
        l->l_phys = NULL;
 
        VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
-           l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf));
+           l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
+           DMU_READ_NO_PREFETCH));
        winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
        ASSERT(winner == NULL);
        dmu_buf_will_dirty(l->l_dbuf, tx);
@@ -496,7 +505,7 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
        ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
        err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
-           blkid << bs, NULL, &db);
+           blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
        if (err)
                return (err);
 
@@ -700,13 +709,17 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
        }
 }
 
-
 static int
-fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers)
+fzap_checkname(zap_name_t *zn)
 {
-       if (name && strlen(name) > ZAP_MAXNAMELEN)
-               return (E2BIG);
+       if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN)
+               return (ENAMETOOLONG);
+       return (0);
+}
 
+static int
+fzap_checksize(uint64_t integer_size, uint64_t num_integers)
+{
        /* Only integer sizes supported by C */
        switch (integer_size) {
        case 1:
@@ -724,6 +737,16 @@ fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers)
        return (0);
 }
 
+static int
+fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers)
+{
+       int err;
+
+       if ((err = fzap_checkname(zn)) != 0)
+               return (err);
+       return (fzap_checksize(integer_size, num_integers));
+}
+
 /*
  * Routines for manipulating attributes.
  */
@@ -736,8 +759,7 @@ fzap_lookup(zap_name_t *zn,
        int err;
        zap_entry_handle_t zeh;
 
-       err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers);
-       if (err != 0)
+       if ((err = fzap_checkname(zn)) != 0)
                return (err);
 
        err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
@@ -745,8 +767,13 @@ fzap_lookup(zap_name_t *zn,
                return (err);
        err = zap_leaf_lookup(l, zn, &zeh);
        if (err == 0) {
+               if ((err = fzap_checksize(integer_size, num_integers)) != 0) {
+                       zap_put_leaf(l);
+                       return (err);
+               }
+
                err = zap_entry_read(&zeh, integer_size, num_integers, buf);
-               (void) zap_entry_read_name(&zeh, rn_len, realname);
+               (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname);
                if (ncp) {
                        *ncp = zap_entry_normalization_conflict(&zeh,
                            zn, NULL, zn->zn_zap);
@@ -769,8 +796,7 @@ fzap_add_cd(zap_name_t *zn,
 
        ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
        ASSERT(!zap->zap_ismicro);
-       ASSERT(fzap_checksize(zn->zn_name_orij,
-           integer_size, num_integers) == 0);
+       ASSERT(fzap_check(zn, integer_size, num_integers) == 0);
 
        err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
        if (err != 0)
@@ -784,7 +810,7 @@ retry:
        if (err != ENOENT)
                goto out;
 
-       err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash, cd,
+       err = zap_entry_create(l, zn, cd,
            integer_size, num_integers, val, &zeh);
 
        if (err == 0) {
@@ -807,12 +833,12 @@ fzap_add(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx)
 {
-       int err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers);
+       int err = fzap_check(zn, integer_size, num_integers);
        if (err != 0)
                return (err);
 
        return (fzap_add_cd(zn, integer_size, num_integers,
-           val, ZAP_MAXCD, tx));
+           val, ZAP_NEED_CD, tx));
 }
 
 int
@@ -825,7 +851,7 @@ fzap_update(zap_name_t *zn,
        zap_t *zap = zn->zn_zap;
 
        ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-       err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers);
+       err = fzap_check(zn, integer_size, num_integers);
        if (err != 0)
                return (err);
 
@@ -838,8 +864,8 @@ retry:
        ASSERT(err == 0 || err == ENOENT);
 
        if (create) {
-               err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash,
-                   ZAP_MAXCD, integer_size, num_integers, val, &zeh);
+               err = zap_entry_create(l, zn, ZAP_NEED_CD,
+                   integer_size, num_integers, val, &zeh);
                if (err == 0)
                        zap_increment_num_entries(zap, 1, tx);
        } else {
@@ -901,6 +927,21 @@ fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
        return (err);
 }
 
+void
+fzap_prefetch(zap_name_t *zn)
+{
+       uint64_t idx, blk;
+       zap_t *zap = zn->zn_zap;
+       int bs;
+
+       idx = ZAP_HASH_IDX(zn->zn_hash,
+           zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+       if (zap_idx_to_blk(zap, idx, &blk) != 0)
+               return;
+       bs = FZAP_BLOCK_SHIFT(zap);
+       dmu_prefetch(zap->zap_objset, zap->zap_object, blk << bs, 1 << bs);
+}
+
 /*
  * Helper functions for consumers.
  */
@@ -952,6 +993,56 @@ zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx)
 }
 
 int
+zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+    uint64_t value, dmu_tx_t *tx)
+{
+       zap_cursor_t zc;
+       zap_attribute_t za;
+       int err;
+
+       for (zap_cursor_init(&zc, os, fromobj);
+           zap_cursor_retrieve(&zc, &za) == 0;
+           (void) zap_cursor_advance(&zc)) {
+               if (za.za_integer_length != 8 || za.za_num_integers != 1)
+                       return (EINVAL);
+               err = zap_add(os, intoobj, za.za_name,
+                   8, 1, &value, tx);
+               if (err)
+                       return (err);
+       }
+       zap_cursor_fini(&zc);
+       return (0);
+}
+
+int
+zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+    dmu_tx_t *tx)
+{
+       zap_cursor_t zc;
+       zap_attribute_t za;
+       int err;
+
+       for (zap_cursor_init(&zc, os, fromobj);
+           zap_cursor_retrieve(&zc, &za) == 0;
+           (void) zap_cursor_advance(&zc)) {
+               uint64_t delta = 0;
+
+               if (za.za_integer_length != 8 || za.za_num_integers != 1)
+                       return (EINVAL);
+
+               err = zap_lookup(os, intoobj, za.za_name, 8, 1, &delta);
+               if (err != 0 && err != ENOENT)
+                       return (err);
+               delta += za.za_first_integer;
+               err = zap_update(os, intoobj, za.za_name, 8, 1, &delta, tx);
+               if (err)
+                       return (err);
+       }
+       zap_cursor_fini(&zc);
+       return (0);
+}
+
+int
 zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
 {
        char name[20];
@@ -978,6 +1069,56 @@ zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value)
        return (zap_lookup(os, obj, name, 8, 1, &value));
 }
 
+int
+zap_add_int_key(objset_t *os, uint64_t obj,
+    uint64_t key, uint64_t value, dmu_tx_t *tx)
+{
+       char name[20];
+
+       (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+       return (zap_add(os, obj, name, 8, 1, &value, tx));
+}
+
+int
+zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
+{
+       char name[20];
+
+       (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+       return (zap_lookup(os, obj, name, 8, 1, valuep));
+}
+
+int
+zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
+    dmu_tx_t *tx)
+{
+       uint64_t value = 0;
+       int err;
+
+       if (delta == 0)
+               return (0);
+
+       err = zap_lookup(os, obj, name, 8, 1, &value);
+       if (err != 0 && err != ENOENT)
+               return (err);
+       value += delta;
+       if (value == 0)
+               err = zap_remove(os, obj, name, tx);
+       else
+               err = zap_update(os, obj, name, 8, 1, &value, tx);
+       return (err);
+}
+
+int
+zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
+    dmu_tx_t *tx)
+{
+       char name[20];
+
+       (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+       return (zap_increment(os, obj, name, delta, tx));
+}
+
 /*
  * Routines for iterating over the attributes.
  */
@@ -1039,7 +1180,7 @@ again:
                        err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
                        ASSERT(err == 0 || err == EOVERFLOW);
                }
-               err = zap_entry_read_name(&zeh,
+               err = zap_entry_read_name(zap, &zeh,
                    sizeof (za->za_name), za->za_name);
                ASSERT(err == 0);
 
@@ -1051,7 +1192,6 @@ again:
        return (err);
 }
 
-
 static void
 zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
 {
@@ -1078,6 +1218,31 @@ zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
        }
 }
 
+int
+fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn)
+{
+       int err;
+       zap_leaf_t *l;
+       zap_entry_handle_t zeh;
+
+       if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN)
+               return (ENAMETOOLONG);
+
+       err = zap_deref_leaf(zc->zc_zap, zn->zn_hash, NULL, RW_READER, &l);
+       if (err != 0)
+               return (err);
+
+       err = zap_leaf_lookup(l, zn, &zeh);
+       if (err != 0)
+               return (err);
+
+       zc->zc_leaf = l;
+       zc->zc_hash = zeh.zeh_hash;
+       zc->zc_cd = zeh.zeh_cd;
+
+       return (err);
+}
+
 void
 fzap_get_stats(zap_t *zap, zap_stats_t *zs)
 {
@@ -1123,7 +1288,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
 
                        err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
                            (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs,
-                           FTAG, &db);
+                           FTAG, &db, DMU_READ_NO_PREFETCH);
                        if (err == 0) {
                                zap_stats_ptrtbl(zap, db->db_data,
                                    1<<(bs-3), zs);
@@ -1145,9 +1310,9 @@ fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
         * Account for the header block of the fatzap.
         */
        if (!add && dmu_buf_freeable(zap->zap_dbuf)) {
-               tooverwrite += zap->zap_dbuf->db_size;
+               *tooverwrite += zap->zap_dbuf->db_size;
        } else {
-               towrite += zap->zap_dbuf->db_size;
+               *towrite += zap->zap_dbuf->db_size;
        }
 
        /*
@@ -1160,9 +1325,9 @@ fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
         */
        if (add) {
                if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0)
-                       towrite += zap->zap_dbuf->db_size;
+                       *towrite += zap->zap_dbuf->db_size;
                else
-                       towrite += (zap->zap_dbuf->db_size * 3);
+                       *towrite += (zap->zap_dbuf->db_size * 3);
        }
 
        /*
@@ -1175,13 +1340,13 @@ fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
        }
 
        if (!add && dmu_buf_freeable(l->l_dbuf)) {
-               tooverwrite += l->l_dbuf->db_size;
+               *tooverwrite += l->l_dbuf->db_size;
        } else {
                /*
                 * If this an add operation, the leaf block could split.
                 * Hence, we need to account for an additional leaf block.
                 */
-               towrite += (add ? 2 : 1) * l->l_dbuf->db_size;
+               *towrite += (add ? 2 : 1) * l->l_dbuf->db_size;
        }
 
        zap_put_leaf(l);
index 9d8354e..19a795d 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
@@ -29,6 +28,7 @@
  * the names are stored null-terminated.
  */
 
+#include <sys/zio.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
 #include <sys/zfs_context.h>
@@ -36,6 +36,7 @@
 #include <sys/zap.h>
 #include <sys/zap_impl.h>
 #include <sys/zap_leaf.h>
+#include <sys/arc.h>
 
 static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
 
@@ -126,12 +127,12 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
                        le = &lc->l_entry;
 
                        le->le_type =           BSWAP_8(le->le_type);
-                       le->le_int_size =       BSWAP_8(le->le_int_size);
+                       le->le_value_intlen =   BSWAP_8(le->le_value_intlen);
                        le->le_next =           BSWAP_16(le->le_next);
                        le->le_name_chunk =     BSWAP_16(le->le_name_chunk);
-                       le->le_name_length =    BSWAP_16(le->le_name_length);
+                       le->le_name_numints =   BSWAP_16(le->le_name_numints);
                        le->le_value_chunk =    BSWAP_16(le->le_value_chunk);
-                       le->le_value_length =   BSWAP_16(le->le_value_length);
+                       le->le_value_numints =  BSWAP_16(le->le_value_numints);
                        le->le_cd =             BSWAP_32(le->le_cd);
                        le->le_hash =           BSWAP_64(le->le_hash);
                        break;
@@ -214,7 +215,7 @@ zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
 
 static uint16_t
 zap_leaf_array_create(zap_leaf_t *l, const char *buf,
-       int integer_size, int num_integers)
+    int integer_size, int num_integers)
 {
        uint16_t chunk_head;
        uint16_t *chunkp = &chunk_head;
@@ -272,11 +273,12 @@ zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
 static void
 zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
     int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
-    char *buf)
+    void *buf)
 {
        int len = MIN(array_len, buf_len);
        int byten = 0;
        uint64_t value = 0;
+       char *p = buf;
 
        ASSERT3U(array_int_len, <=, buf_int_len);
 
@@ -284,7 +286,7 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
        if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
                struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
                uint8_t *ip = la->la_array;
-               uint64_t *buf64 = (uint64_t *)buf;
+               uint64_t *buf64 = buf;
 
                *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
                    (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
@@ -299,8 +301,8 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
                while (chunk != CHAIN_END) {
                        struct zap_leaf_array *la =
                            &ZAP_LEAF_CHUNK(l, chunk).l_array;
-                       bcopy(la->la_array, buf, ZAP_LEAF_ARRAY_BYTES);
-                       buf += ZAP_LEAF_ARRAY_BYTES;
+                       bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES);
+                       p += ZAP_LEAF_ARRAY_BYTES;
                        chunk = la->la_next;
                }
                return;
@@ -315,50 +317,69 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
                        value = (value << 8) | la->la_array[i];
                        byten++;
                        if (byten == array_int_len) {
-                               stv(buf_int_len, buf, value);
+                               stv(buf_int_len, p, value);
                                byten = 0;
                                len--;
                                if (len == 0)
                                        return;
-                               buf += buf_int_len;
+                               p += buf_int_len;
                        }
                }
                chunk = la->la_next;
        }
 }
 
-/*
- * Only to be used on 8-bit arrays.
- * array_len is actual len in bytes (not encoded le_value_length).
- * namenorm is null-terminated.
- */
 static boolean_t
-zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, int chunk, int array_len)
+zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
+    int chunk, int array_numints)
 {
        int bseen = 0;
 
+       if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) {
+               uint64_t *thiskey;
+               boolean_t match;
+
+               ASSERT(zn->zn_key_intlen == sizeof (*thiskey));
+               thiskey = kmem_alloc(array_numints * sizeof (*thiskey),
+                   KM_SLEEP);
+
+               zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints,
+                   sizeof (*thiskey), array_numints, thiskey);
+               match = bcmp(thiskey, zn->zn_key_orig,
+                   array_numints * sizeof (*thiskey)) == 0;
+               kmem_free(thiskey, array_numints * sizeof (*thiskey));
+               return (match);
+       }
+
+       ASSERT(zn->zn_key_intlen == 1);
        if (zn->zn_matchtype == MT_FIRST) {
-               char *thisname = kmem_alloc(array_len, KM_SLEEP);
+               char *thisname = kmem_alloc(array_numints, KM_SLEEP);
                boolean_t match;
 
-               zap_leaf_array_read(l, chunk, 1, array_len, 1,
-                   array_len, thisname);
+               zap_leaf_array_read(l, chunk, sizeof (char), array_numints,
+                   sizeof (char), array_numints, thisname);
                match = zap_match(zn, thisname);
-               kmem_free(thisname, array_len);
+               kmem_free(thisname, array_numints);
                return (match);
        }
 
-       /* Fast path for exact matching */
-       while (bseen < array_len) {
+       /*
+        * Fast path for exact matching.
+        * First check that the lengths match, so that we don't read
+        * past the end of the zn_key_orig array.
+        */
+       if (array_numints != zn->zn_key_orig_numints)
+               return (B_FALSE);
+       while (bseen < array_numints) {
                struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
-               int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
+               int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES);
                ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
-               if (bcmp(la->la_array, zn->zn_name_orij + bseen, toread))
+               if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread))
                        break;
                chunk = la->la_next;
                bseen += toread;
        }
-       return (bseen == array_len);
+       return (bseen == array_numints);
 }
 
 /*
@@ -393,9 +414,9 @@ again:
                ASSERT(zn->zn_matchtype == MT_EXACT ||
                    (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
                if (zap_leaf_array_match(l, zn, le->le_name_chunk,
-                   le->le_name_length)) {
-                       zeh->zeh_num_integers = le->le_value_length;
-                       zeh->zeh_integer_size = le->le_int_size;
+                   le->le_name_numints)) {
+                       zeh->zeh_num_integers = le->le_value_numints;
+                       zeh->zeh_integer_size = le->le_value_intlen;
                        zeh->zeh_cd = le->le_cd;
                        zeh->zeh_hash = le->le_hash;
                        zeh->zeh_chunkp = chunkp;
@@ -426,7 +447,7 @@ zap_leaf_lookup_closest(zap_leaf_t *l,
 {
        uint16_t chunk;
        uint64_t besth = -1ULL;
-       uint32_t bestcd = ZAP_MAXCD;
+       uint32_t bestcd = -1U;
        uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
        uint16_t lh;
        struct zap_leaf_entry *le;
@@ -448,8 +469,8 @@ zap_leaf_lookup_closest(zap_leaf_t *l,
                                besth = le->le_hash;
                                bestcd = le->le_cd;
 
-                               zeh->zeh_num_integers = le->le_value_length;
-                               zeh->zeh_integer_size = le->le_int_size;
+                               zeh->zeh_num_integers = le->le_value_numints;
+                               zeh->zeh_integer_size = le->le_value_intlen;
                                zeh->zeh_cd = le->le_cd;
                                zeh->zeh_hash = le->le_hash;
                                zeh->zeh_fakechunk = chunk;
@@ -459,7 +480,7 @@ zap_leaf_lookup_closest(zap_leaf_t *l,
                }
        }
 
-       return (bestcd == ZAP_MAXCD ? ENOENT : 0);
+       return (bestcd == -1U ? ENOENT : 0);
 }
 
 int
@@ -470,11 +491,12 @@ zap_entry_read(const zap_entry_handle_t *zeh,
            ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
        ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
 
-       if (le->le_int_size > integer_size)
+       if (le->le_value_intlen > integer_size)
                return (EINVAL);
 
-       zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, le->le_int_size,
-           le->le_value_length, integer_size, num_integers, buf);
+       zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk,
+           le->le_value_intlen, le->le_value_numints,
+           integer_size, num_integers, buf);
 
        if (zeh->zeh_num_integers > num_integers)
                return (EOVERFLOW);
@@ -483,15 +505,21 @@ zap_entry_read(const zap_entry_handle_t *zeh,
 }
 
 int
-zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf)
+zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen,
+    char *buf)
 {
        struct zap_leaf_entry *le =
            ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
        ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
 
-       zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
-           le->le_name_length, 1, buflen, buf);
-       if (le->le_name_length > buflen)
+       if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
+               zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8,
+                   le->le_name_numints, 8, buflen / 8, buf);
+       } else {
+               zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
+                   le->le_name_numints, 1, buflen, buf);
+       }
+       if (le->le_name_numints > buflen)
                return (EOVERFLOW);
        return (0);
 }
@@ -505,24 +533,16 @@ zap_entry_update(zap_entry_handle_t *zeh,
        struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
 
        delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
-           ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length * le->le_int_size);
+           ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen);
 
        if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks)
                return (EAGAIN);
 
-       /*
-        * We should search other chained leaves (via
-        * zap_entry_remove,create?) otherwise returning EAGAIN will
-        * just send us into an infinite loop if we have to chain
-        * another leaf block, rather than being able to split this
-        * block.
-        */
-
        zap_leaf_array_free(l, &le->le_value_chunk);
        le->le_value_chunk =
            zap_leaf_array_create(l, buf, integer_size, num_integers);
-       le->le_value_length = num_integers;
-       le->le_int_size = integer_size;
+       le->le_value_numints = num_integers;
+       le->le_value_intlen = integer_size;
        return (0);
 }
 
@@ -549,26 +569,25 @@ zap_entry_remove(zap_entry_handle_t *zeh)
 }
 
 int
-zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
+zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
     uint8_t integer_size, uint64_t num_integers, const void *buf,
     zap_entry_handle_t *zeh)
 {
        uint16_t chunk;
        uint16_t *chunkp;
        struct zap_leaf_entry *le;
-       uint64_t namelen, valuelen;
+       uint64_t valuelen;
        int numchunks;
+       uint64_t h = zn->zn_hash;
 
        valuelen = integer_size * num_integers;
-       namelen = strlen(name) + 1;
-       ASSERT(namelen >= 2);
 
-       numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(namelen) +
-           ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
+       numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
+           zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
        if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
                return (E2BIG);
 
-       if (cd == ZAP_MAXCD) {
+       if (cd == ZAP_NEED_CD) {
                /* find the lowest unused cd */
                if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
                        cd = 0;
@@ -585,7 +604,7 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
                        }
                } else {
                        /* old unsorted format; do it the O(n^2) way */
-                       for (cd = 0; cd < ZAP_MAXCD; cd++) {
+                       for (cd = 0; ; cd++) {
                                for (chunk = *LEAF_HASH_ENTPTR(l, h);
                                    chunk != CHAIN_END; chunk = le->le_next) {
                                        le = ZAP_LEAF_ENTRY(l, chunk);
@@ -600,10 +619,10 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
                        }
                }
                /*
-                * we would run out of space in a block before we could
-                * have ZAP_MAXCD entries
+                * We would run out of space in a block before we could
+                * store enough entries to run out of CD values.
                 */
-               ASSERT3U(cd, <, ZAP_MAXCD);
+               ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
        }
 
        if (l->l_phys->l_hdr.lh_nfree < numchunks)
@@ -613,12 +632,13 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
        chunk = zap_leaf_chunk_alloc(l);
        le = ZAP_LEAF_ENTRY(l, chunk);
        le->le_type = ZAP_CHUNK_ENTRY;
-       le->le_name_chunk = zap_leaf_array_create(l, name, 1, namelen);
-       le->le_name_length = namelen;
+       le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig,
+           zn->zn_key_intlen, zn->zn_key_orig_numints);
+       le->le_name_numints = zn->zn_key_orig_numints;
        le->le_value_chunk =
            zap_leaf_array_create(l, buf, integer_size, num_integers);
-       le->le_value_length = num_integers;
-       le->le_int_size = integer_size;
+       le->le_value_numints = num_integers;
+       le->le_value_intlen = integer_size;
        le->le_hash = h;
        le->le_cd = cd;
 
@@ -630,7 +650,7 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
 
        zeh->zeh_leaf = l;
        zeh->zeh_num_integers = num_integers;
-       zeh->zeh_integer_size = le->le_int_size;
+       zeh->zeh_integer_size = le->le_value_intlen;
        zeh->zeh_cd = le->le_cd;
        zeh->zeh_hash = le->le_hash;
        zeh->zeh_chunkp = chunkp;
@@ -672,7 +692,7 @@ zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
                        allocdzn = B_TRUE;
                }
                if (zap_leaf_array_match(zeh->zeh_leaf, zn,
-                   le->le_name_chunk, le->le_name_length)) {
+                   le->le_name_chunk, le->le_name_numints)) {
                        if (allocdzn)
                                zap_name_free(zn);
                        return (B_TRUE);
@@ -835,9 +855,9 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
                        struct zap_leaf_entry *le =
                            ZAP_LEAF_ENTRY(l, chunk);
 
-                       n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_length) +
-                           ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length *
-                           le->le_int_size);
+                       n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) +
+                           ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints *
+                           le->le_value_intlen);
                        n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
                        zs->zs_entries_using_n_chunks[n]++;
 
index 528d31d..2d89c20 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
+#include <sys/zio.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
 #include <sys/zfs_context.h>
 #include <sys/zap_impl.h>
 #include <sys/zap_leaf.h>
 #include <sys/avl.h>
+#include <sys/arc.h>
 
 #ifdef _KERNEL
 #include <sys/sunddi.h>
 #endif
 
-static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx);
+static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags);
 
+uint64_t
+zap_getflags(zap_t *zap)
+{
+       if (zap->zap_ismicro)
+               return (0);
+       return (zap->zap_u.zap_fat.zap_phys->zap_flags);
+}
+
+int
+zap_hashbits(zap_t *zap)
+{
+       if (zap_getflags(zap) & ZAP_FLAG_HASH64)
+               return (48);
+       else
+               return (28);
+}
+
+uint32_t
+zap_maxcd(zap_t *zap)
+{
+       if (zap_getflags(zap) & ZAP_FLAG_HASH64)
+               return ((1<<16)-1);
+       else
+               return (-1U);
+}
 
 static uint64_t
-zap_hash(zap_t *zap, const char *normname)
+zap_hash(zap_name_t *zn)
 {
-       const uint8_t *cp;
-       uint8_t c;
-       uint64_t crc = zap->zap_salt;
+       zap_t *zap = zn->zn_zap;
+       uint64_t h = 0;
 
-       /* NB: name must already be normalized, if necessary */
+       if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
+               ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
+               h = *(uint64_t *)zn->zn_key_orig;
+       } else {
+               h = zap->zap_salt;
+               ASSERT(h != 0);
+               ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+
+               if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
+                       int i;
+                       const uint64_t *wp = zn->zn_key_norm;
+
+                       ASSERT(zn->zn_key_intlen == 8);
+                       for (i = 0; i < zn->zn_key_norm_numints; wp++, i++) {
+                               int j;
+                               uint64_t word = *wp;
+
+                               for (j = 0; j < zn->zn_key_intlen; j++) {
+                                       h = (h >> 8) ^
+                                           zfs_crc64_table[(h ^ word) & 0xFF];
+                                       word >>= NBBY;
+                               }
+                       }
+               } else {
+                       int i, len;
+                       const uint8_t *cp = zn->zn_key_norm;
 
-       ASSERT(crc != 0);
-       ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
-       for (cp = (const uint8_t *)normname; (c = *cp) != '\0'; cp++) {
-               crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
-       }
+                       /*
+                        * We previously stored the terminating null on
+                        * disk, but didn't hash it, so we need to
+                        * continue to not hash it.  (The
+                        * zn_key_*_numints includes the terminating
+                        * null for non-binary keys.)
+                        */
+                       len = zn->zn_key_norm_numints - 1;
 
+                       ASSERT(zn->zn_key_intlen == 1);
+                       for (i = 0; i < len; cp++, i++) {
+                               h = (h >> 8) ^
+                                   zfs_crc64_table[(h ^ *cp) & 0xFF];
+                       }
+               }
+       }
        /*
-        * Only use 28 bits, since we need 4 bits in the cookie for the
-        * collision differentiator.  We MUST use the high bits, since
-        * those are the ones that we first pay attention to when
+        * Don't use all 64 bits, since we need some in the cookie for
+        * the collision differentiator.  We MUST use the high bits,
+        * since those are the ones that we first pay attention to when
         * chosing the bucket.
         */
-       crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
+       h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
 
-       return (crc);
+       return (h);
 }
 
 static int
@@ -71,6 +131,8 @@ zap_normalize(zap_t *zap, const char *name, char *namenorm)
        size_t inlen, outlen;
        int err;
 
+       ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
+
        inlen = strlen(name) + 1;
        outlen = ZAP_MAXNAMELEN;
 
@@ -85,16 +147,18 @@ zap_normalize(zap_t *zap, const char *name, char *namenorm)
 boolean_t
 zap_match(zap_name_t *zn, const char *matchname)
 {
+       ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
+
        if (zn->zn_matchtype == MT_FIRST) {
                char norm[ZAP_MAXNAMELEN];
 
                if (zap_normalize(zn->zn_zap, matchname, norm) != 0)
                        return (B_FALSE);
 
-               return (strcmp(zn->zn_name_norm, norm) == 0);
+               return (strcmp(zn->zn_key_norm, norm) == 0);
        } else {
                /* MT_BEST or MT_EXACT */
-               return (strcmp(zn->zn_name_orij, matchname) == 0);
+               return (strcmp(zn->zn_key_orig, matchname) == 0);
        }
 }
 
@@ -104,30 +168,49 @@ zap_name_free(zap_name_t *zn)
        kmem_free(zn, sizeof (zap_name_t));
 }
 
-/* XXX combine this with zap_lockdir()? */
 zap_name_t *
-zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt)
+zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
 {
        zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
 
        zn->zn_zap = zap;
-       zn->zn_name_orij = name;
+       zn->zn_key_intlen = sizeof (*key);
+       zn->zn_key_orig = key;
+       zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1;
        zn->zn_matchtype = mt;
        if (zap->zap_normflags) {
-               if (zap_normalize(zap, name, zn->zn_normbuf) != 0) {
+               if (zap_normalize(zap, key, zn->zn_normbuf) != 0) {
                        zap_name_free(zn);
                        return (NULL);
                }
-               zn->zn_name_norm = zn->zn_normbuf;
+               zn->zn_key_norm = zn->zn_normbuf;
+               zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
        } else {
                if (mt != MT_EXACT) {
                        zap_name_free(zn);
                        return (NULL);
                }
-               zn->zn_name_norm = zn->zn_name_orij;
+               zn->zn_key_norm = zn->zn_key_orig;
+               zn->zn_key_norm_numints = zn->zn_key_orig_numints;
        }
 
-       zn->zn_hash = zap_hash(zap, zn->zn_name_norm);
+       zn->zn_hash = zap_hash(zn);
+       return (zn);
+}
+
+zap_name_t *
+zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
+{
+       zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
+
+       ASSERT(zap->zap_normflags == 0);
+       zn->zn_zap = zap;
+       zn->zn_key_intlen = sizeof (*key);
+       zn->zn_key_orig = zn->zn_key_norm = key;
+       zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
+       zn->zn_matchtype = MT_EXACT;
+
+       zn->zn_hash = zap_hash(zn);
        return (zn);
 }
 
@@ -172,26 +255,26 @@ mze_compare(const void *arg1, const void *arg2)
                return (+1);
        if (mze1->mze_hash < mze2->mze_hash)
                return (-1);
-       if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd)
+       if (mze1->mze_cd > mze2->mze_cd)
                return (+1);
-       if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd)
+       if (mze1->mze_cd < mze2->mze_cd)
                return (-1);
        return (0);
 }
 
 static void
-mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep)
+mze_insert(zap_t *zap, int chunkid, uint64_t hash)
 {
        mzap_ent_t *mze;
 
        ASSERT(zap->zap_ismicro);
        ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-       ASSERT(mzep->mze_cd < ZAP_MAXCD);
 
        mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
        mze->mze_chunkid = chunkid;
        mze->mze_hash = hash;
-       mze->mze_phys = *mzep;
+       mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd;
+       ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0);
        avl_add(&zap->zap_m.zap_avl, mze);
 }
 
@@ -206,18 +289,16 @@ mze_find(zap_name_t *zn)
        ASSERT(zn->zn_zap->zap_ismicro);
        ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
 
-       if (strlen(zn->zn_name_norm) >= sizeof (mze_tofind.mze_phys.mze_name))
-               return (NULL);
-
        mze_tofind.mze_hash = zn->zn_hash;
-       mze_tofind.mze_phys.mze_cd = 0;
+       mze_tofind.mze_cd = 0;
 
 again:
        mze = avl_find(avl, &mze_tofind, &idx);
        if (mze == NULL)
                mze = avl_nearest(avl, idx, AVL_AFTER);
        for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) {
-               if (zap_match(zn, mze->mze_phys.mze_name))
+               ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
+               if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
                        return (mze);
        }
        if (zn->zn_matchtype == MT_BEST) {
@@ -240,12 +321,12 @@ mze_find_unused_cd(zap_t *zap, uint64_t hash)
        ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
        mze_tofind.mze_hash = hash;
-       mze_tofind.mze_phys.mze_cd = 0;
+       mze_tofind.mze_cd = 0;
 
        cd = 0;
        for (mze = avl_find(avl, &mze_tofind, &idx);
            mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
-               if (mze->mze_phys.mze_cd != cd)
+               if (mze->mze_cd != cd)
                        break;
                cd++;
        }
@@ -329,7 +410,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
                                zap->zap_m.zap_num_entries++;
                                zn = zap_name_alloc(zap, mze->mze_name,
                                    MT_EXACT);
-                               mze_insert(zap, i, zn->zn_hash, mze);
+                               mze_insert(zap, i, zn->zn_hash);
                                zap_name_free(zn);
                        }
                }
@@ -371,7 +452,7 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
 
        *zapp = NULL;
 
-       err = dmu_buf_hold(os, obj, 0, NULL, &db);
+       err = dmu_buf_hold(os, obj, 0, NULL, &db, DMU_READ_NO_PREFETCH);
        if (err)
                return (err);
 
@@ -421,7 +502,7 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
                        dprintf("upgrading obj %llu: num_entries=%u\n",
                            obj, zap->zap_m.zap_num_entries);
                        *zapp = zap;
-                       return (mzap_upgrade(zapp, tx));
+                       return (mzap_upgrade(zapp, tx, 0));
                }
                err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
                ASSERT3U(err, ==, 0);
@@ -441,10 +522,11 @@ zap_unlockdir(zap_t *zap)
 }
 
 static int
-mzap_upgrade(zap_t **zapp, dmu_tx_t *tx)
+mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
 {
        mzap_phys_t *mzp;
-       int i, sz, nchunks, err;
+       int i, sz, nchunks;
+       int err = 0;
        zap_t *zap = *zapp;
 
        ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
@@ -454,11 +536,13 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx)
        bcopy(zap->zap_dbuf->db_data, mzp, sz);
        nchunks = zap->zap_m.zap_num_chunks;
 
-       err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
-           1ULL << fzap_default_block_shift, 0, tx);
-       if (err) {
-               kmem_free(mzp, sz);
-               return (err);
+       if (!flags) {
+               err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
+                   1ULL << fzap_default_block_shift, 0, tx);
+               if (err) {
+                       kmem_free(mzp, sz);
+                       return (err);
+               }
        }
 
        dprintf("upgrading obj=%llu with %u chunks\n",
@@ -466,10 +550,9 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx)
        /* XXX destroy the avl later, so we can use the stored hash value */
        mze_destroy(zap);
 
-       fzap_upgrade(zap, tx);
+       fzap_upgrade(zap, tx, flags);
 
        for (i = 0; i < nchunks; i++) {
-               int err;
                mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
                zap_name_t *zn;
                if (mze->mze_name[0] == 0)
@@ -489,12 +572,13 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx)
 }
 
 static void
-mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx)
+mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
+    dmu_tx_t *tx)
 {
        dmu_buf_t *db;
        mzap_phys_t *zp;
 
-       VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db));
+       VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
 
 #ifdef ZFS_DEBUG
        {
@@ -510,6 +594,15 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx)
        zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
        zp->mz_normflags = normflags;
        dmu_buf_rele(db, FTAG);
+
+       if (flags != 0) {
+               zap_t *zap;
+               /* Only fat zap supports flags; upgrade immediately. */
+               VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER,
+                   B_FALSE, B_FALSE, &zap));
+               VERIFY3U(0, ==, mzap_upgrade(&zap, tx, flags));
+               zap_unlockdir(zap);
+       }
 }
 
 int
@@ -530,7 +623,7 @@ zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
        err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
        if (err != 0)
                return (err);
-       mzap_create_impl(os, obj, normflags, tx);
+       mzap_create_impl(os, obj, normflags, 0, tx);
        return (0);
 }
 
@@ -547,7 +640,26 @@ zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
 {
        uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
 
-       mzap_create_impl(os, obj, normflags, tx);
+       mzap_create_impl(os, obj, normflags, 0, tx);
+       return (obj);
+}
+
+uint64_t
+zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
+    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+       uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
+
+       ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
+           leaf_blockshift <= SPA_MAXBLOCKSHIFT &&
+           indirect_blockshift >= SPA_MINBLOCKSHIFT &&
+           indirect_blockshift <= SPA_MAXBLOCKSHIFT);
+
+       VERIFY(dmu_object_set_blocksize(os, obj,
+           1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
+
+       mzap_create_impl(os, obj, normflags, flags, tx);
        return (obj);
 }
 
@@ -617,11 +729,11 @@ again:
            other = avl_walk(&zap->zap_m.zap_avl, other, direction)) {
 
                if (zn == NULL) {
-                       zn = zap_name_alloc(zap, mze->mze_phys.mze_name,
+                       zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name,
                            MT_FIRST);
                        allocdzn = B_TRUE;
                }
-               if (zap_match(zn, other->mze_phys.mze_name)) {
+               if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
                        if (allocdzn)
                                zap_name_free(zn);
                        return (B_TRUE);
@@ -683,9 +795,10 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
                        } else if (integer_size != 8) {
                                err = EINVAL;
                        } else {
-                               *(uint64_t *)buf = mze->mze_phys.mze_value;
+                               *(uint64_t *)buf =
+                                   MZE_PHYS(zap, mze)->mze_value;
                                (void) strlcpy(realname,
-                                   mze->mze_phys.mze_name, rn_len);
+                                   MZE_PHYS(zap, mze)->mze_name, rn_len);
                                if (ncp) {
                                        *ncp = mzap_normalization_conflict(zap,
                                            zn, mze);
@@ -699,6 +812,63 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
 }
 
 int
+zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints)
+{
+       zap_t *zap;
+       int err;
+       zap_name_t *zn;
+
+       err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+       if (err)
+               return (err);
+       zn = zap_name_alloc_uint64(zap, key, key_numints);
+       if (zn == NULL) {
+               zap_unlockdir(zap);
+               return (ENOTSUP);
+       }
+
+       fzap_prefetch(zn);
+       zap_name_free(zn);
+       zap_unlockdir(zap);
+       return (err);
+}
+
+int
+zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+       zap_t *zap;
+       int err;
+       zap_name_t *zn;
+
+       err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+       if (err)
+               return (err);
+       zn = zap_name_alloc_uint64(zap, key, key_numints);
+       if (zn == NULL) {
+               zap_unlockdir(zap);
+               return (ENOTSUP);
+       }
+
+       err = fzap_lookup(zn, integer_size, num_integers, buf,
+           NULL, 0, NULL);
+       zap_name_free(zn);
+       zap_unlockdir(zap);
+       return (err);
+}
+
+int
+zap_contains(objset_t *os, uint64_t zapobj, const char *name)
+{
+       int err = (zap_lookup_norm(os, zapobj, name, 0,
+           0, NULL, MT_EXACT, NULL, 0, NULL));
+       if (err == EOVERFLOW || err == EINVAL)
+               err = 0; /* found, but skipped reading the value */
+       return (err);
+}
+
+int
 zap_length(objset_t *os, uint64_t zapobj, const char *name,
     uint64_t *integer_size, uint64_t *num_integers)
 {
@@ -733,6 +903,28 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name,
        return (err);
 }
 
+int
+zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints, uint64_t *integer_size, uint64_t *num_integers)
+{
+       zap_t *zap;
+       int err;
+       zap_name_t *zn;
+
+       err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+       if (err)
+               return (err);
+       zn = zap_name_alloc_uint64(zap, key, key_numints);
+       if (zn == NULL) {
+               zap_unlockdir(zap);
+               return (ENOTSUP);
+       }
+       err = fzap_length(zn, integer_size, num_integers);
+       zap_name_free(zn);
+       zap_unlockdir(zap);
+       return (err);
+}
+
 static void
 mzap_addent(zap_name_t *zn, uint64_t value)
 {
@@ -741,20 +933,18 @@ mzap_addent(zap_name_t *zn, uint64_t value)
        int start = zap->zap_m.zap_alloc_next;
        uint32_t cd;
 
-       dprintf("obj=%llu %s=%llu\n", zap->zap_object,
-           zn->zn_name_orij, value);
        ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
 #ifdef ZFS_DEBUG
        for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
                mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
-               ASSERT(strcmp(zn->zn_name_orij, mze->mze_name) != 0);
+               ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
        }
 #endif
 
        cd = mze_find_unused_cd(zap, zn->zn_hash);
        /* given the limited size of the microzap, this can't happen */
-       ASSERT(cd != ZAP_MAXCD);
+       ASSERT(cd < zap_maxcd(zap));
 
 again:
        for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
@@ -762,13 +952,13 @@ again:
                if (mze->mze_name[0] == 0) {
                        mze->mze_value = value;
                        mze->mze_cd = cd;
-                       (void) strcpy(mze->mze_name, zn->zn_name_orij);
+                       (void) strcpy(mze->mze_name, zn->zn_key_orig);
                        zap->zap_m.zap_num_entries++;
                        zap->zap_m.zap_alloc_next = i+1;
                        if (zap->zap_m.zap_alloc_next ==
                            zap->zap_m.zap_num_chunks)
                                zap->zap_m.zap_alloc_next = 0;
-                       mze_insert(zap, i, zn->zn_hash, mze);
+                       mze_insert(zap, i, zn->zn_hash);
                        return;
                }
        }
@@ -780,7 +970,7 @@ again:
 }
 
 int
-zap_add(objset_t *os, uint64_t zapobj, const char *name,
+zap_add(objset_t *os, uint64_t zapobj, const char *key,
     int integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx)
 {
@@ -793,7 +983,7 @@ zap_add(objset_t *os, uint64_t zapobj, const char *name,
        err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
        if (err)
                return (err);
-       zn = zap_name_alloc(zap, name, MT_EXACT);
+       zn = zap_name_alloc(zap, key, MT_EXACT);
        if (zn == NULL) {
                zap_unlockdir(zap);
                return (ENOTSUP);
@@ -802,10 +992,8 @@ zap_add(objset_t *os, uint64_t zapobj, const char *name,
                err = fzap_add(zn, integer_size, num_integers, val, tx);
                zap = zn->zn_zap;       /* fzap_add() may change zap */
        } else if (integer_size != 8 || num_integers != 1 ||
-           strlen(name) >= MZAP_NAME_LEN) {
-               dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
-                   zapobj, integer_size, num_integers, name);
-               err = mzap_upgrade(&zn->zn_zap, tx);
+           strlen(key) >= MZAP_NAME_LEN) {
+               err = mzap_upgrade(&zn->zn_zap, tx, 0);
                if (err == 0)
                        err = fzap_add(zn, integer_size, num_integers, val, tx);
                zap = zn->zn_zap;       /* fzap_add() may change zap */
@@ -825,15 +1013,50 @@ zap_add(objset_t *os, uint64_t zapobj, const char *name,
 }
 
 int
+zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints, int integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx)
+{
+       zap_t *zap;
+       int err;
+       zap_name_t *zn;
+
+       err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+       if (err)
+               return (err);
+       zn = zap_name_alloc_uint64(zap, key, key_numints);
+       if (zn == NULL) {
+               zap_unlockdir(zap);
+               return (ENOTSUP);
+       }
+       err = fzap_add(zn, integer_size, num_integers, val, tx);
+       zap = zn->zn_zap;       /* fzap_add() may change zap */
+       zap_name_free(zn);
+       if (zap != NULL)        /* may be NULL if fzap_add() failed */
+               zap_unlockdir(zap);
+       return (err);
+}
+
+int
 zap_update(objset_t *os, uint64_t zapobj, const char *name,
     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
 {
        zap_t *zap;
        mzap_ent_t *mze;
+       uint64_t oldval;
        const uint64_t *intval = val;
        zap_name_t *zn;
        int err;
 
+#ifdef ZFS_DEBUG
+       /*
+        * If there is an old value, it shouldn't change across the
+        * lockdir (eg, due to bprewrite's xlation).
+        */
+       if (integer_size == 8 && num_integers == 1)
+               (void) zap_lookup(os, zapobj, name, 8, 1, &oldval);
+#endif
+
        err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
        if (err)
                return (err);
@@ -849,7 +1072,7 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
            strlen(name) >= MZAP_NAME_LEN) {
                dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
                    zapobj, integer_size, num_integers, name);
-               err = mzap_upgrade(&zn->zn_zap, tx);
+               err = mzap_upgrade(&zn->zn_zap, tx, 0);
                if (err == 0)
                        err = fzap_update(zn, integer_size, num_integers,
                            val, tx);
@@ -857,9 +1080,8 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
        } else {
                mze = mze_find(zn);
                if (mze != NULL) {
-                       mze->mze_phys.mze_value = *intval;
-                       zap->zap_m.zap_phys->mz_chunk
-                           [mze->mze_chunkid].mze_value = *intval;
+                       ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval);
+                       MZE_PHYS(zap, mze)->mze_value = *intval;
                } else {
                        mzap_addent(zn, *intval);
                }
@@ -872,6 +1094,31 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
 }
 
 int
+zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints,
+    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+       zap_t *zap;
+       zap_name_t *zn;
+       int err;
+
+       err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+       if (err)
+               return (err);
+       zn = zap_name_alloc_uint64(zap, key, key_numints);
+       if (zn == NULL) {
+               zap_unlockdir(zap);
+               return (ENOTSUP);
+       }
+       err = fzap_update(zn, integer_size, num_integers, val, tx);
+       zap = zn->zn_zap;       /* fzap_update() may change zap */
+       zap_name_free(zn);
+       if (zap != NULL)        /* may be NULL if fzap_upgrade() failed */
+               zap_unlockdir(zap);
+       return (err);
+}
+
+int
 zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
 {
        return (zap_remove_norm(os, zapobj, name, MT_EXACT, tx));
@@ -912,17 +1159,32 @@ zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
        return (err);
 }
 
+int
+zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints, dmu_tx_t *tx)
+{
+       zap_t *zap;
+       int err;
+       zap_name_t *zn;
+
+       err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
+       if (err)
+               return (err);
+       zn = zap_name_alloc_uint64(zap, key, key_numints);
+       if (zn == NULL) {
+               zap_unlockdir(zap);
+               return (ENOTSUP);
+       }
+       err = fzap_remove(zn, tx);
+       zap_name_free(zn);
+       zap_unlockdir(zap);
+       return (err);
+}
+
 /*
  * Routines for iterating over the attributes.
  */
 
-/*
- * We want to keep the high 32 bits of the cursor zero if we can, so
- * that 32-bit programs can access this.  So use a small hash value so
- * we can fit 4 bits of cd into the 32-bit cursor.
- *
- * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ]
- */
 void
 zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
     uint64_t serialized)
@@ -931,15 +1193,9 @@ zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
        zc->zc_zap = NULL;
        zc->zc_leaf = NULL;
        zc->zc_zapobj = zapobj;
-       if (serialized == -1ULL) {
-               zc->zc_hash = -1ULL;
-               zc->zc_cd = 0;
-       } else {
-               zc->zc_hash = serialized << (64-ZAP_HASHBITS);
-               zc->zc_cd = serialized >> ZAP_HASHBITS;
-               if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */
-                       zc->zc_cd = 0;
-       }
+       zc->zc_serialized = serialized;
+       zc->zc_hash = 0;
+       zc->zc_cd = 0;
 }
 
 void
@@ -969,10 +1225,21 @@ zap_cursor_serialize(zap_cursor_t *zc)
 {
        if (zc->zc_hash == -1ULL)
                return (-1ULL);
-       ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0);
-       ASSERT(zc->zc_cd < ZAP_MAXCD);
-       return ((zc->zc_hash >> (64-ZAP_HASHBITS)) |
-           ((uint64_t)zc->zc_cd << ZAP_HASHBITS));
+       if (zc->zc_zap == NULL)
+               return (zc->zc_serialized);
+       ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
+       ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
+
+       /*
+        * We want to keep the high 32 bits of the cursor zero if we can, so
+        * that 32-bit programs can access this.  So usually use a small
+        * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
+        * of the cursor.
+        *
+        * [ collision differentiator | zap_hashbits()-bit hash value ]
+        */
+       return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
+           ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
 }
 
 int
@@ -987,10 +1254,23 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
                return (ENOENT);
 
        if (zc->zc_zap == NULL) {
+               int hb;
                err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
                    RW_READER, TRUE, FALSE, &zc->zc_zap);
                if (err)
                        return (err);
+
+               /*
+                * To support zap_cursor_init_serialized, advance, retrieve,
+                * we must add to the existing zc_cd, which may already
+                * be 1 due to the zap_cursor_advance.
+                */
+               ASSERT(zc->zc_hash == 0);
+               hb = zap_hashbits(zc->zc_zap);
+               zc->zc_hash = zc->zc_serialized << (64 - hb);
+               zc->zc_cd += zc->zc_serialized >> hb;
+               if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
+                       zc->zc_cd = 0;
        } else {
                rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
        }
@@ -1000,7 +1280,7 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
                err = ENOENT;
 
                mze_tofind.mze_hash = zc->zc_hash;
-               mze_tofind.mze_phys.mze_cd = zc->zc_cd;
+               mze_tofind.mze_cd = zc->zc_cd;
 
                mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
                if (mze == NULL) {
@@ -1008,18 +1288,16 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
                            idx, AVL_AFTER);
                }
                if (mze) {
-                       ASSERT(0 == bcmp(&mze->mze_phys,
-                           &zc->zc_zap->zap_m.zap_phys->mz_chunk
-                           [mze->mze_chunkid], sizeof (mze->mze_phys)));
-
+                       mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
+                       ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
                        za->za_normalization_conflict =
                            mzap_normalization_conflict(zc->zc_zap, NULL, mze);
                        za->za_integer_length = 8;
                        za->za_num_integers = 1;
-                       za->za_first_integer = mze->mze_phys.mze_value;
-                       (void) strcpy(za->za_name, mze->mze_phys.mze_name);
+                       za->za_first_integer = mzep->mze_value;
+                       (void) strcpy(za->za_name, mzep->mze_name);
                        zc->zc_hash = mze->mze_hash;
-                       zc->zc_cd = mze->mze_phys.mze_cd;
+                       zc->zc_cd = mze->mze_cd;
                        err = 0;
                } else {
                        zc->zc_hash = -1ULL;
@@ -1035,12 +1313,46 @@ zap_cursor_advance(zap_cursor_t *zc)
        if (zc->zc_hash == -1ULL)
                return;
        zc->zc_cd++;
-       if (zc->zc_cd >= ZAP_MAXCD) {
-               zc->zc_cd = 0;
-               zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS);
-               if (zc->zc_hash == 0) /* EOF */
-                       zc->zc_hash = -1ULL;
+}
+
+int
+zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt)
+{
+       int err = 0;
+       mzap_ent_t *mze;
+       zap_name_t *zn;
+
+       if (zc->zc_zap == NULL) {
+               err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
+                   RW_READER, TRUE, FALSE, &zc->zc_zap);
+               if (err)
+                       return (err);
+       } else {
+               rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
+       }
+
+       zn = zap_name_alloc(zc->zc_zap, name, mt);
+       if (zn == NULL) {
+               rw_exit(&zc->zc_zap->zap_rwlock);
+               return (ENOTSUP);
+       }
+
+       if (!zc->zc_zap->zap_ismicro) {
+               err = fzap_cursor_move_to_key(zc, zn);
+       } else {
+               mze = mze_find(zn);
+               if (mze == NULL) {
+                       err = ENOENT;
+                       goto out;
+               }
+               zc->zc_hash = mze->mze_hash;
+               zc->zc_cd = mze->mze_cd;
        }
+
+out:
+       zap_name_free(zn);
+       rw_exit(&zc->zc_zap->zap_rwlock);
+       return (err);
 }
 
 int
index 12ffe9f..1181bd4 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -50,6 +49,7 @@
 #include <sys/dmu.h>
 #include <sys/dnode.h>
 #include <sys/zap.h>
+#include <sys/sa.h>
 #include "fs/fs_subr.h"
 #include <acl/acl_common.h>
 
@@ -321,6 +321,82 @@ static acl_ops_t zfs_acl_fuid_ops = {
        zfs_ace_fuid_data
 };
 
+/*
+ * The following three functions are provided for compatibility with
+ * older ZPL version in order to determine if the file used to have
+ * an external ACL and what version of ACL previously existed on the
+ * file.  Would really be nice to not need this, sigh.
+ */
+
+uint64_t
+zfs_external_acl(znode_t *zp)
+{
+       zfs_acl_phys_t acl_phys;
+
+       if (zp->z_is_sa)
+               return (0);
+
+       VERIFY(0 == sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
+           &acl_phys, sizeof (acl_phys)));
+
+       return (acl_phys.z_acl_extern_obj);
+}
+
+/*
+ * Determine size of ACL in bytes
+ *
+ * This is more complicated than it should be since we have to deal
+ * with old external ACLs.
+ */
+static int
+zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount,
+    zfs_acl_phys_t *aclphys)
+{
+       zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+       uint64_t acl_count;
+       int size;
+       int error;
+
+       if (zp->z_is_sa) {
+               if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs),
+                   &size)) != 0)
+                       return (error);
+               *aclsize = size;
+               if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs),
+                   &acl_count, sizeof (acl_count))) != 0)
+                       return (error);
+               *aclcount = acl_count;
+       } else {
+               if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+                   aclphys, sizeof (*aclphys))) != 0)
+                       return (error);
+
+               if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) {
+                       *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size);
+                       *aclcount = aclphys->z_acl_size;
+               } else {
+                       *aclsize = aclphys->z_acl_size;
+                       *aclcount = aclphys->z_acl_count;
+               }
+       }
+       return (0);
+}
+
+int
+zfs_znode_acl_version(znode_t *zp)
+{
+       zfs_acl_phys_t acl_phys;
+
+       if (zp->z_is_sa) {
+               return (ZFS_ACL_VERSION_FUID);
+       } else {
+               VERIFY(0 == sa_lookup(zp->z_sa_hdl,
+                   SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
+                   &acl_phys, sizeof (acl_phys)));
+               return (acl_phys.z_acl_version);
+       }
+}
+
 static int
 zfs_acl_version(int version)
 {
@@ -336,7 +412,7 @@ zfs_acl_version_zp(znode_t *zp)
        return (zfs_acl_version(zp->z_zfsvfs->z_version));
 }
 
-static zfs_acl_t *
+zfs_acl_t *
 zfs_acl_alloc(int vers)
 {
        zfs_acl_t *aclp;
@@ -352,7 +428,7 @@ zfs_acl_alloc(int vers)
        return (aclp);
 }
 
-static zfs_acl_node_t *
+zfs_acl_node_t *
 zfs_acl_node_alloc(size_t bytes)
 {
        zfs_acl_node_t *aclnode;
@@ -463,6 +539,8 @@ zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who,
 {
        zfs_acl_node_t *aclnode;
 
+       ASSERT(aclp);
+
        if (start == NULL) {
                aclnode = list_head(&aclp->z_acl);
                if (aclnode == NULL)
@@ -509,6 +587,7 @@ zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who,
                *who = aclp->z_ops.ace_who_get(acep);
                aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size;
                aclnode->z_ace_idx++;
+
                return ((void *)acep);
        }
        return (NULL);
@@ -542,7 +621,7 @@ zfs_acl_curr_node(zfs_acl_t *aclp)
  */
 int
 zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp,
-    void *datap, zfs_ace_t *z_acl, int aclcnt, size_t *size,
+    void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size,
     zfs_fuid_info_t **fuidp, cred_t *cr)
 {
        int i;
@@ -773,8 +852,9 @@ zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask,
  * Determine mode of file based on ACL.
  * Also, create FUIDs for any User/Group ACEs
  */
-static uint64_t
-zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
+uint64_t
+zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp,
+    uint64_t *pflags, uint64_t fuid, uint64_t fgid)
 {
        int             entry_type;
        mode_t          mode;
@@ -785,7 +865,7 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
        uint32_t        access_mask;
        boolean_t       an_exec_denied = B_FALSE;
 
-       mode = (zp->z_phys->zp_mode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
+       mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
 
        while (acep = zfs_acl_next_ace(aclp, acep, &who,
            &access_mask, &iflags, &type)) {
@@ -803,7 +883,8 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
                    entry_type == OWNING_GROUP))
                        continue;
 
-               if (entry_type == ACE_OWNER) {
+               if (entry_type == ACE_OWNER || (entry_type == 0 &&
+                   who == fuid)) {
                        if ((access_mask & ACE_READ_DATA) &&
                            (!(seen & S_IRUSR))) {
                                seen |= S_IRUSR;
@@ -825,7 +906,8 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
                                        mode |= S_IXUSR;
                                }
                        }
-               } else if (entry_type == OWNING_GROUP) {
+               } else if (entry_type == OWNING_GROUP ||
+                   (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) {
                        if ((access_mask & ACE_READ_DATA) &&
                            (!(seen & S_IRGRP))) {
                                seen |= S_IRGRP;
@@ -930,48 +1012,13 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
                an_exec_denied = B_TRUE;
 
        if (an_exec_denied)
-               zp->z_phys->zp_flags &= ~ZFS_NO_EXECS_DENIED;
+               *pflags &= ~ZFS_NO_EXECS_DENIED;
        else
-               zp->z_phys->zp_flags |= ZFS_NO_EXECS_DENIED;
+               *pflags |= ZFS_NO_EXECS_DENIED;
 
        return (mode);
 }
 
-static zfs_acl_t *
-zfs_acl_node_read_internal(znode_t *zp, boolean_t will_modify)
-{
-       zfs_acl_t       *aclp;
-       zfs_acl_node_t  *aclnode;
-
-       aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version);
-
-       /*
-        * Version 0 to 1 znode_acl_phys has the size/count fields swapped.
-        * Version 0 didn't have a size field, only a count.
-        */
-       if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) {
-               aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_size;
-               aclp->z_acl_bytes = ZFS_ACL_SIZE(aclp->z_acl_count);
-       } else {
-               aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count;
-               aclp->z_acl_bytes = zp->z_phys->zp_acl.z_acl_size;
-       }
-
-       aclnode = zfs_acl_node_alloc(will_modify ? aclp->z_acl_bytes : 0);
-       aclnode->z_ace_count = aclp->z_acl_count;
-       if (will_modify) {
-               bcopy(zp->z_phys->zp_acl.z_ace_data, aclnode->z_acldata,
-                   aclp->z_acl_bytes);
-       } else {
-               aclnode->z_size = aclp->z_acl_bytes;
-               aclnode->z_acldata = &zp->z_phys->zp_acl.z_ace_data[0];
-       }
-
-       list_insert_head(&aclp->z_acl, aclnode);
-
-       return (aclp);
-}
-
 /*
  * Read an external acl object.  If the intent is to modify, always
  * create a new acl and leave any cached acl in place.
@@ -979,12 +1026,13 @@ zfs_acl_node_read_internal(znode_t *zp, boolean_t will_modify)
 static int
 zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
 {
-       uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj;
        zfs_acl_t       *aclp;
-       size_t          aclsize;
-       size_t          acl_count;
+       int             aclsize;
+       int             acl_count;
        zfs_acl_node_t  *aclnode;
-       int error;
+       zfs_acl_phys_t  znode_acl;
+       int             version;
+       int             error;
 
        ASSERT(MUTEX_HELD(&zp->z_acl_lock));
 
@@ -993,48 +1041,110 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
                return (0);
        }
 
-       if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) {
-               *aclpp = zfs_acl_node_read_internal(zp, will_modify);
-               if (!will_modify)
-                       zp->z_acl_cached = *aclpp;
-               return (0);
-       }
+       version = ZNODE_ACL_VERSION(zp);
 
-       aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version);
-       if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) {
-               zfs_acl_phys_v0_t *zacl0 =
-                   (zfs_acl_phys_v0_t *)&zp->z_phys->zp_acl;
+       if ((error = zfs_acl_znode_info(zp, &aclsize,
+           &acl_count, &znode_acl)) != 0)
+               return (error);
+
+       aclp = zfs_acl_alloc(version);
 
-               aclsize = ZFS_ACL_SIZE(zacl0->z_acl_count);
-               acl_count = zacl0->z_acl_count;
-       } else {
-               aclsize = zp->z_phys->zp_acl.z_acl_size;
-               acl_count = zp->z_phys->zp_acl.z_acl_count;
-               if (aclsize == 0)
-                       aclsize = acl_count * sizeof (zfs_ace_t);
-       }
-       aclnode = zfs_acl_node_alloc(aclsize);
-       list_insert_head(&aclp->z_acl, aclnode);
-       error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0,
-           aclsize, aclnode->z_acldata, DMU_READ_PREFETCH);
-       aclnode->z_ace_count = acl_count;
        aclp->z_acl_count = acl_count;
        aclp->z_acl_bytes = aclsize;
 
+       aclnode = zfs_acl_node_alloc(aclsize);
+       aclnode->z_ace_count = aclp->z_acl_count;
+       aclnode->z_size = aclsize;
+
+       if (!zp->z_is_sa) {
+               if (znode_acl.z_acl_extern_obj) {
+                       error = dmu_read(zp->z_zfsvfs->z_os,
+                           znode_acl.z_acl_extern_obj, 0, aclnode->z_size,
+                           aclnode->z_acldata, DMU_READ_PREFETCH);
+               } else {
+                       bcopy(znode_acl.z_ace_data, aclnode->z_acldata,
+                           aclnode->z_size);
+               }
+       } else {
+               error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs),
+                   aclnode->z_acldata, aclnode->z_size);
+       }
+
        if (error != 0) {
                zfs_acl_free(aclp);
+               zfs_acl_node_free(aclnode);
                /* convert checksum errors into IO errors */
                if (error == ECKSUM)
                        error = EIO;
                return (error);
        }
 
+       list_insert_head(&aclp->z_acl, aclnode);
+
        *aclpp = aclp;
        if (!will_modify)
                zp->z_acl_cached = aclp;
        return (0);
 }
 
+/*ARGSUSED*/
+void
+zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen,
+    boolean_t start, void *userdata)
+{
+       zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata;
+
+       if (start) {
+               cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl);
+       } else {
+               cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl,
+                   cb->cb_acl_node);
+       }
+       *dataptr = cb->cb_acl_node->z_acldata;
+       *length = cb->cb_acl_node->z_size;
+}
+
+
+static int
+zfs_acl_get_owner_fuids(znode_t *zp, uint64_t *fuid, uint64_t *fgid)
+{
+       int count = 0;
+       sa_bulk_attr_t  bulk[2];
+       int error;
+
+       if (IS_EPHEMERAL(zp->z_uid) || IS_EPHEMERAL(zp->z_gid)) {
+               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zp->z_zfsvfs), NULL,
+                   fuid, sizeof (*fuid));
+               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zp->z_zfsvfs), NULL,
+                   fgid, sizeof (*fgid));
+               if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
+                       return (error);
+               }
+       } else {
+               *fuid = zp->z_uid;
+               *fgid = zp->z_gid;
+       }
+       return (0);
+}
+
+int
+zfs_acl_chown_setattr(znode_t *zp)
+{
+       int error;
+       zfs_acl_t *aclp;
+       uint64_t fuid, fgid;
+
+       if ((error = zfs_acl_get_owner_fuids(zp, &fuid, &fgid)) != 0)
+               return (error);
+
+       mutex_enter(&zp->z_acl_lock);
+       if ((error = zfs_acl_node_read(zp, &aclp, B_FALSE)) == 0)
+               zp->z_mode = zfs_mode_compute(zp->z_mode, aclp,
+                   &zp->z_pflags, fuid, fgid);
+       mutex_exit(&zp->z_acl_lock);
+       return (error);
+}
+
 /*
  * common code for setting ACLs.
  *
@@ -1045,28 +1155,38 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
 int
 zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
 {
-       int             error;
-       znode_phys_t    *zphys = zp->z_phys;
-       zfs_acl_phys_t  *zacl = &zphys->zp_acl;
-       zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
-       uint64_t        aoid = zphys->zp_acl.z_acl_extern_obj;
-       uint64_t        off = 0;
-       dmu_object_type_t otype;
-       zfs_acl_node_t  *aclnode;
+       int                     error;
+       zfsvfs_t                *zfsvfs = zp->z_zfsvfs;
+       dmu_object_type_t       otype;
+       zfs_acl_locator_cb_t    locate = { 0 };
+       uint64_t                mode;
+       sa_bulk_attr_t          bulk[5];
+       uint64_t                ctime[2];
+       int                     count = 0;
+       uint64_t                fuid, fgid;
+
+       mode = zp->z_mode;
+
+       if ((error = zfs_acl_get_owner_fuids(zp, &fuid, &fgid)) != 0)
+               return (error);
+
+       mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, fuid, fgid);
 
-       dmu_buf_will_dirty(zp->z_dbuf, tx);
+       zp->z_mode = mode;
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+           &mode, sizeof (mode));
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+           &zp->z_pflags, sizeof (zp->z_pflags));
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+           &ctime, sizeof (ctime));
 
        if (zp->z_acl_cached) {
                zfs_acl_free(zp->z_acl_cached);
                zp->z_acl_cached = NULL;
        }
 
-       zphys->zp_mode = zfs_mode_compute(zp, aclp);
-
        /*
-        * Decide which object type to use.  If we are forced to
-        * use old ACL format then transform ACL into zfs_oldace_t
-        * layout.
+        * Upgrade needed?
         */
        if (!zfsvfs->z_use_fuids) {
                otype = DMU_OT_OLDACL;
@@ -1078,84 +1198,113 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
                otype = DMU_OT_ACL;
        }
 
-       if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
-               /*
-                * If ACL was previously external and we are now
-                * converting to new ACL format then release old
-                * ACL object and create a new one.
-                */
-               if (aoid && aclp->z_version != zacl->z_acl_version) {
-                       error = dmu_object_free(zfsvfs->z_os,
-                           zp->z_phys->zp_acl.z_acl_extern_obj, tx);
-                       if (error)
-                               return (error);
-                       aoid = 0;
-               }
-               if (aoid == 0) {
-                       aoid = dmu_object_alloc(zfsvfs->z_os,
-                           otype, aclp->z_acl_bytes,
-                           otype == DMU_OT_ACL ? DMU_OT_SYSACL : DMU_OT_NONE,
-                           otype == DMU_OT_ACL ? DN_MAX_BONUSLEN : 0, tx);
+       /*
+        * Arrgh, we have to handle the old on-disk format
+        * as well as newer (preferred) SA format.
+        */
+
+       if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */
+               locate.cb_aclp = aclp;
+               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs),
+                   zfs_acl_data_locator, &locate, aclp->z_acl_bytes);
+               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs),
+                   NULL, &aclp->z_acl_count, sizeof (uint64_t));
+       } else { /* Painful legacy way */
+               zfs_acl_node_t *aclnode;
+               uint64_t off = 0;
+               zfs_acl_phys_t acl_phys;
+               uint64_t aoid;
+
+               if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+                   &acl_phys, sizeof (acl_phys))) != 0)
+                       return (error);
+
+               aoid = acl_phys.z_acl_extern_obj;
+
+               if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+                       /*
+                        * If ACL was previously external and we are now
+                        * converting to new ACL format then release old
+                        * ACL object and create a new one.
+                        */
+                       if (aoid &&
+                           aclp->z_version != acl_phys.z_acl_version) {
+                               error = dmu_object_free(zfsvfs->z_os, aoid, tx);
+                               if (error)
+                                       return (error);
+                               aoid = 0;
+                       }
+                       if (aoid == 0) {
+                               aoid = dmu_object_alloc(zfsvfs->z_os,
+                                   otype, aclp->z_acl_bytes,
+                                   otype == DMU_OT_ACL ?
+                                   DMU_OT_SYSACL : DMU_OT_NONE,
+                                   otype == DMU_OT_ACL ?
+                                   DN_MAX_BONUSLEN : 0, tx);
+                       } else {
+                               (void) dmu_object_set_blocksize(zfsvfs->z_os,
+                                   aoid, aclp->z_acl_bytes, 0, tx);
+                       }
+                       acl_phys.z_acl_extern_obj = aoid;
+                       for (aclnode = list_head(&aclp->z_acl); aclnode;
+                           aclnode = list_next(&aclp->z_acl, aclnode)) {
+                               if (aclnode->z_ace_count == 0)
+                                       continue;
+                               dmu_write(zfsvfs->z_os, aoid, off,
+                                   aclnode->z_size, aclnode->z_acldata, tx);
+                               off += aclnode->z_size;
+                       }
                } else {
-                       (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid,
-                           aclp->z_acl_bytes, 0, tx);
-               }
-               zphys->zp_acl.z_acl_extern_obj = aoid;
-               for (aclnode = list_head(&aclp->z_acl); aclnode;
-                   aclnode = list_next(&aclp->z_acl, aclnode)) {
-                       if (aclnode->z_ace_count == 0)
-                               continue;
-                       dmu_write(zfsvfs->z_os, aoid, off,
-                           aclnode->z_size, aclnode->z_acldata, tx);
-                       off += aclnode->z_size;
+                       void *start = acl_phys.z_ace_data;
+                       /*
+                        * Migrating back embedded?
+                        */
+                       if (acl_phys.z_acl_extern_obj) {
+                               error = dmu_object_free(zfsvfs->z_os,
+                                   acl_phys.z_acl_extern_obj, tx);
+                               if (error)
+                                       return (error);
+                               acl_phys.z_acl_extern_obj = 0;
+                       }
+
+                       for (aclnode = list_head(&aclp->z_acl); aclnode;
+                           aclnode = list_next(&aclp->z_acl, aclnode)) {
+                               if (aclnode->z_ace_count == 0)
+                                       continue;
+                               bcopy(aclnode->z_acldata, start,
+                                   aclnode->z_size);
+                               start = (caddr_t)start + aclnode->z_size;
+                       }
                }
-       } else {
-               void *start = zacl->z_ace_data;
                /*
-                * Migrating back embedded?
+                * If Old version then swap count/bytes to match old
+                * layout of znode_acl_phys_t.
                 */
-               if (zphys->zp_acl.z_acl_extern_obj) {
-                       error = dmu_object_free(zfsvfs->z_os,
-                           zp->z_phys->zp_acl.z_acl_extern_obj, tx);
-                       if (error)
-                               return (error);
-                       zphys->zp_acl.z_acl_extern_obj = 0;
-               }
-
-               for (aclnode = list_head(&aclp->z_acl); aclnode;
-                   aclnode = list_next(&aclp->z_acl, aclnode)) {
-                       if (aclnode->z_ace_count == 0)
-                               continue;
-                       bcopy(aclnode->z_acldata, start, aclnode->z_size);
-                       start = (caddr_t)start + aclnode->z_size;
+               if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+                       acl_phys.z_acl_size = aclp->z_acl_count;
+                       acl_phys.z_acl_count = aclp->z_acl_bytes;
+               } else {
+                       acl_phys.z_acl_size = aclp->z_acl_bytes;
+                       acl_phys.z_acl_count = aclp->z_acl_count;
                }
-       }
+               acl_phys.z_acl_version = aclp->z_version;
 
-       /*
-        * If Old version then swap count/bytes to match old
-        * layout of znode_acl_phys_t.
-        */
-       if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
-               zphys->zp_acl.z_acl_size = aclp->z_acl_count;
-               zphys->zp_acl.z_acl_count = aclp->z_acl_bytes;
-       } else {
-               zphys->zp_acl.z_acl_size = aclp->z_acl_bytes;
-               zphys->zp_acl.z_acl_count = aclp->z_acl_count;
+               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+                   &acl_phys, sizeof (acl_phys));
        }
 
-       zphys->zp_acl.z_acl_version = aclp->z_version;
-
        /*
         * Replace ACL wide bits, but first clear them.
         */
-       zp->z_phys->zp_flags &= ~ZFS_ACL_WIDE_FLAGS;
+       zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS;
 
-       zp->z_phys->zp_flags |= aclp->z_hints;
+       zp->z_pflags |= aclp->z_hints;
 
        if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0)
-               zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL;
+               zp->z_pflags |= ZFS_ACL_TRIVIAL;
 
-       return (0);
+       zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime, B_TRUE);
+       return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
 }
 
 /*
@@ -1225,314 +1374,64 @@ zfs_acl_prepend_fixup(zfs_acl_t *aclp, void  *acep, void  *origacep,
        aclp->z_ops.ace_mask_set(acep, acepmask);
 }
 
-/*
- * Apply mode to canonical six ACEs.
- */
 static void
-zfs_acl_fixup_canonical_six(zfs_acl_t *aclp, mode_t mode)
-{
-       zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl);
-       void    *acep;
-       int     maskoff = aclp->z_ops.ace_mask_off();
-       size_t abstract_size = aclp->z_ops.ace_abstract_size();
-
-       ASSERT(aclnode != NULL);
-
-       acep = (void *)((caddr_t)aclnode->z_acldata +
-           aclnode->z_size - (abstract_size * 6));
-
-       /*
-        * Fixup final ACEs to match the mode
-        */
-
-       adjust_ace_pair_common(acep, maskoff, abstract_size,
-           (mode & 0700) >> 6);        /* owner@ */
-
-       acep = (caddr_t)acep + (abstract_size * 2);
-
-       adjust_ace_pair_common(acep, maskoff, abstract_size,
-           (mode & 0070) >> 3);        /* group@ */
-
-       acep = (caddr_t)acep + (abstract_size * 2);
-       adjust_ace_pair_common(acep, maskoff,
-           abstract_size, mode);       /* everyone@ */
-}
-
-
-static int
-zfs_acl_ace_match(zfs_acl_t *aclp, void *acep, int allow_deny,
-    int entry_type, int accessmask)
-{
-       uint32_t mask = aclp->z_ops.ace_mask_get(acep);
-       uint16_t type = aclp->z_ops.ace_type_get(acep);
-       uint16_t flags = aclp->z_ops.ace_flags_get(acep);
-
-       return (mask == accessmask && type == allow_deny &&
-           ((flags & ACE_TYPE_FLAGS) == entry_type));
-}
-
-/*
- * Can prepended ACE be reused?
- */
-static int
-zfs_reuse_deny(zfs_acl_t *aclp, void *acep, void *prevacep)
+zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t mode, zfs_acl_t *aclp)
 {
-       int okay_masks;
-       uint16_t prevtype;
-       uint16_t prevflags;
-       uint16_t flags;
-       uint32_t mask, prevmask;
-
-       if (prevacep == NULL)
-               return (B_FALSE);
-
-       prevtype = aclp->z_ops.ace_type_get(prevacep);
-       prevflags = aclp->z_ops.ace_flags_get(prevacep);
-       flags = aclp->z_ops.ace_flags_get(acep);
-       mask = aclp->z_ops.ace_mask_get(acep);
-       prevmask = aclp->z_ops.ace_mask_get(prevacep);
-
-       if (prevtype != DENY)
-               return (B_FALSE);
-
-       if (prevflags != (flags & ACE_IDENTIFIER_GROUP))
-               return (B_FALSE);
-
-       okay_masks = (mask & OKAY_MASK_BITS);
-
-       if (prevmask & ~okay_masks)
-               return (B_FALSE);
-
-       return (B_TRUE);
-}
-
-
-/*
- * Insert new ACL node into chain of zfs_acl_node_t's
- *
- * This will result in two possible results.
- * 1. If the ACL is currently just a single zfs_acl_node and
- *    we are prepending the entry then current acl node will have
- *    a new node inserted above it.
- *
- * 2. If we are inserting in the middle of current acl node then
- *    the current node will be split in two and new node will be inserted
- *    in between the two split nodes.
- */
-static zfs_acl_node_t *
-zfs_acl_ace_insert(zfs_acl_t *aclp, void  *acep)
-{
-       zfs_acl_node_t  *newnode;
-       zfs_acl_node_t  *trailernode = NULL;
-       zfs_acl_node_t  *currnode = zfs_acl_curr_node(aclp);
-       int             curr_idx = aclp->z_curr_node->z_ace_idx;
-       int             trailer_count;
-       size_t          oldsize;
-
-       newnode = zfs_acl_node_alloc(aclp->z_ops.ace_size(acep));
-       newnode->z_ace_count = 1;
-
-       oldsize = currnode->z_size;
-
-       if (curr_idx != 1) {
-               trailernode = zfs_acl_node_alloc(0);
-               trailernode->z_acldata = acep;
-
-               trailer_count = currnode->z_ace_count - curr_idx + 1;
-               currnode->z_ace_count = curr_idx - 1;
-               currnode->z_size = (caddr_t)acep - (caddr_t)currnode->z_acldata;
-               trailernode->z_size = oldsize - currnode->z_size;
-               trailernode->z_ace_count = trailer_count;
-       }
-
-       aclp->z_acl_count += 1;
-       aclp->z_acl_bytes += aclp->z_ops.ace_size(acep);
-
-       if (curr_idx == 1)
-               list_insert_before(&aclp->z_acl, currnode, newnode);
-       else
-               list_insert_after(&aclp->z_acl, currnode, newnode);
-       if (trailernode) {
-               list_insert_after(&aclp->z_acl, newnode, trailernode);
-               aclp->z_curr_node = trailernode;
-               trailernode->z_ace_idx = 1;
-       }
-
-       return (newnode);
-}
-
-/*
- * Prepend deny ACE
- */
-static void *
-zfs_acl_prepend_deny(uint64_t uid, zfs_acl_t *aclp, void *acep,
-    mode_t mode)
-{
-       zfs_acl_node_t *aclnode;
-       void  *newacep;
-       uint64_t fuid;
-       uint16_t flags;
-
-       aclnode = zfs_acl_ace_insert(aclp, acep);
-       newacep = aclnode->z_acldata;
-       fuid = aclp->z_ops.ace_who_get(acep);
-       flags = aclp->z_ops.ace_flags_get(acep);
-       zfs_set_ace(aclp, newacep, 0, DENY, fuid, (flags & ACE_TYPE_FLAGS));
-       zfs_acl_prepend_fixup(aclp, newacep, acep, mode, uid);
-
-       return (newacep);
-}
-
-/*
- * Split an inherited ACE into inherit_only ACE
- * and original ACE with inheritance flags stripped off.
- */
-static void
-zfs_acl_split_ace(zfs_acl_t *aclp, zfs_ace_hdr_t *acep)
-{
-       zfs_acl_node_t *aclnode;
-       zfs_acl_node_t *currnode;
-       void  *newacep;
-       uint16_t type, flags;
-       uint32_t mask;
-       uint64_t fuid;
-
-       type = aclp->z_ops.ace_type_get(acep);
-       flags = aclp->z_ops.ace_flags_get(acep);
-       mask = aclp->z_ops.ace_mask_get(acep);
-       fuid = aclp->z_ops.ace_who_get(acep);
-
-       aclnode = zfs_acl_ace_insert(aclp, acep);
-       newacep = aclnode->z_acldata;
-
-       aclp->z_ops.ace_type_set(newacep, type);
-       aclp->z_ops.ace_flags_set(newacep, flags | ACE_INHERIT_ONLY_ACE);
-       aclp->z_ops.ace_mask_set(newacep, mask);
-       aclp->z_ops.ace_type_set(newacep, type);
-       aclp->z_ops.ace_who_set(newacep, fuid);
-       aclp->z_next_ace = acep;
-       flags &= ~ALL_INHERIT;
-       aclp->z_ops.ace_flags_set(acep, flags);
-       currnode = zfs_acl_curr_node(aclp);
-       ASSERT(currnode->z_ace_idx >= 1);
-       currnode->z_ace_idx -= 1;
-}
-
-/*
- * Are ACES started at index i, the canonical six ACES?
- */
-static int
-zfs_have_canonical_six(zfs_acl_t *aclp)
-{
-       void *acep;
-       zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl);
-       int             i = 0;
-       size_t abstract_size = aclp->z_ops.ace_abstract_size();
-
-       ASSERT(aclnode != NULL);
-
-       if (aclnode->z_ace_count < 6)
-               return (0);
-
-       acep = (void *)((caddr_t)aclnode->z_acldata +
-           aclnode->z_size - (aclp->z_ops.ace_abstract_size() * 6));
-
-       if ((zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++),
-           DENY, ACE_OWNER, 0) &&
-           zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++),
-           ALLOW, ACE_OWNER, OWNER_ALLOW_MASK) &&
-           zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), DENY,
-           OWNING_GROUP, 0) && zfs_acl_ace_match(aclp, (caddr_t)acep +
-           (abstract_size * i++),
-           ALLOW, OWNING_GROUP, 0) &&
-           zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++),
-           DENY, ACE_EVERYONE, EVERYONE_DENY_MASK) &&
-           zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++),
-           ALLOW, ACE_EVERYONE, EVERYONE_ALLOW_MASK))) {
-               return (1);
-       } else {
-               return (0);
-       }
-}
-
-
-/*
- * Apply step 1g, to group entries
- *
- * Need to deal with corner case where group may have
- * greater permissions than owner.  If so then limit
- * group permissions, based on what extra permissions
- * group has.
- */
-static void
-zfs_fixup_group_entries(zfs_acl_t *aclp, void *acep, void *prevacep,
-    mode_t mode)
-{
-       uint32_t prevmask = aclp->z_ops.ace_mask_get(prevacep);
-       uint32_t mask = aclp->z_ops.ace_mask_get(acep);
-       uint16_t prevflags = aclp->z_ops.ace_flags_get(prevacep);
-       mode_t extramode = (mode >> 3) & 07;
-       mode_t ownermode = (mode >> 6);
-
-       if (prevflags & ACE_IDENTIFIER_GROUP) {
-
-               extramode &= ~ownermode;
-
-               if (extramode) {
-                       if (extramode & S_IROTH) {
-                               prevmask &= ~ACE_READ_DATA;
-                               mask &= ~ACE_READ_DATA;
-                       }
-                       if (extramode & S_IWOTH) {
-                               prevmask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
-                               mask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
-                       }
-                       if (extramode & S_IXOTH) {
-                               prevmask  &= ~ACE_EXECUTE;
-                               mask &= ~ACE_EXECUTE;
-                       }
-               }
-       }
-       aclp->z_ops.ace_mask_set(acep, mask);
-       aclp->z_ops.ace_mask_set(prevacep, prevmask);
-}
-
-/*
- * Apply the chmod algorithm as described
- * in PSARC/2002/240
- */
-static void
-zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t uid,
-    uint64_t mode, zfs_acl_t *aclp)
-{
-       void            *acep = NULL, *prevacep = NULL;
+       void            *acep = NULL;
        uint64_t        who;
-       int             i;
+       int             new_count, new_bytes;
+       int             ace_size;
        int             entry_type;
-       int             reuse_deny;
-       int             need_canonical_six = 1;
        uint16_t        iflags, type;
        uint32_t        access_mask;
-
-       /*
-        * If discard then just discard all ACL nodes which
-        * represent the ACEs.
-        *
-        * New owner@/group@/everone@ ACEs will be added
-        * later.
-        */
-       if (zfsvfs->z_acl_mode == ZFS_ACL_DISCARD)
-               zfs_acl_release_nodes(aclp);
+       zfs_acl_node_t  *newnode;
+       size_t          abstract_size = aclp->z_ops.ace_abstract_size();
+       void            *zacep;
+       uint32_t        owner, group, everyone;
+       uint32_t        deny1, deny2, allow0;
+
+       new_count = new_bytes = 0;
+
+       acl_trivial_access_masks((mode_t)mode, &allow0, &deny1, &deny2,
+           &owner, &group, &everyone);
+
+       newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes);
+
+       zacep = newnode->z_acldata;
+       if (allow0) {
+               zfs_set_ace(aclp, zacep, allow0, ALLOW, -1, ACE_OWNER);
+               zacep = (void *)((uintptr_t)zacep + abstract_size);
+               new_count++;
+               new_bytes += abstract_size;
+       } if (deny1) {
+               zfs_set_ace(aclp, zacep, deny1, DENY, -1, ACE_OWNER);
+               zacep = (void *)((uintptr_t)zacep + abstract_size);
+               new_count++;
+               new_bytes += abstract_size;
+       }
+       if (deny2) {
+               zfs_set_ace(aclp, zacep, deny2, DENY, -1, OWNING_GROUP);
+               zacep = (void *)((uintptr_t)zacep + abstract_size);
+               new_count++;
+               new_bytes += abstract_size;
+       }
 
        while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
            &iflags, &type)) {
+               uint16_t inherit_flags;
 
                entry_type = (iflags & ACE_TYPE_FLAGS);
-               iflags = (iflags & ALL_INHERIT);
+               inherit_flags = (iflags & ALL_INHERIT);
+
+               if ((entry_type == ACE_OWNER || entry_type == ACE_EVERYONE ||
+                   (entry_type == OWNING_GROUP)) &&
+                   ((inherit_flags & ACE_INHERIT_ONLY_ACE) == 0)) {
+                       continue;
+               }
 
                if ((type != ALLOW && type != DENY) ||
-                   (iflags & ACE_INHERIT_ONLY_ACE)) {
-                       if (iflags)
+                   (inherit_flags & ACE_INHERIT_ONLY_ACE)) {
+                       if (inherit_flags)
                                aclp->z_hints |= ZFS_INHERIT_ACE;
                        switch (type) {
                        case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
@@ -1542,116 +1441,59 @@ zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t uid,
                                aclp->z_hints |= ZFS_ACL_OBJ_ACE;
                                break;
                        }
-                       goto nextace;
-               }
-
-               /*
-                * Need to split ace into two?
-                */
-               if ((iflags & (ACE_FILE_INHERIT_ACE|
-                   ACE_DIRECTORY_INHERIT_ACE)) &&
-                   (!(iflags & ACE_INHERIT_ONLY_ACE))) {
-                       zfs_acl_split_ace(aclp, acep);
-                       aclp->z_hints |= ZFS_INHERIT_ACE;
-                       goto nextace;
-               }
-
-               if (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE ||
-                   (entry_type == OWNING_GROUP)) {
-                       access_mask &= ~OGE_CLEAR;
-                       aclp->z_ops.ace_mask_set(acep, access_mask);
-                       goto nextace;
                } else {
-                       reuse_deny = B_TRUE;
-                       if (type == ALLOW) {
-
-                               /*
-                                * Check preceding ACE if any, to see
-                                * if we need to prepend a DENY ACE.
-                                * This is only applicable when the acl_mode
-                                * property == groupmask.
-                                */
-                               if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK) {
-
-                                       reuse_deny = zfs_reuse_deny(aclp, acep,
-                                           prevacep);
-
-                                       if (!reuse_deny) {
-                                               prevacep =
-                                                   zfs_acl_prepend_deny(uid,
-                                                   aclp, acep, mode);
-                                       } else {
-                                               zfs_acl_prepend_fixup(
-                                                   aclp, prevacep,
-                                                   acep, mode, uid);
-                                       }
-                                       zfs_fixup_group_entries(aclp, acep,
-                                           prevacep, mode);
-                               }
-                       }
-               }
-nextace:
-               prevacep = acep;
-       }
-
-       /*
-        * Check out last six aces, if we have six.
-        */
 
-       if (aclp->z_acl_count >= 6) {
-               if (zfs_have_canonical_six(aclp)) {
-                       need_canonical_six = 0;
+                       /*
+                        * Limit permissions to be no greater than
+                        * group permissions
+                        */
+                       if (zfsvfs->z_acl_inherit == ZFS_ACL_RESTRICTED) {
+                               if (!(mode & S_IRGRP))
+                                       access_mask &= ~ACE_READ_DATA;
+                               if (!(mode & S_IWGRP))
+                                       access_mask &=
+                                           ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+                               if (!(mode & S_IXGRP))
+                                       access_mask &= ~ACE_EXECUTE;
+                               access_mask &=
+                                   ~(ACE_WRITE_OWNER|ACE_WRITE_ACL|
+                                   ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS);
+                       }
                }
-       }
-
-       if (need_canonical_six) {
-               size_t abstract_size = aclp->z_ops.ace_abstract_size();
-               void *zacep;
-               zfs_acl_node_t *aclnode =
-                   zfs_acl_node_alloc(abstract_size * 6);
-
-               aclnode->z_size = abstract_size * 6;
-               aclnode->z_ace_count = 6;
-               aclp->z_acl_bytes += aclnode->z_size;
-               list_insert_tail(&aclp->z_acl, aclnode);
-
-               zacep = aclnode->z_acldata;
-
-               i = 0;
-               zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++),
-                   0, DENY, -1, ACE_OWNER);
-               zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++),
-                   OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER);
-               zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0,
-                   DENY, -1, OWNING_GROUP);
-               zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0,
-                   ALLOW, -1, OWNING_GROUP);
-               zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++),
-                   EVERYONE_DENY_MASK, DENY, -1, ACE_EVERYONE);
-               zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++),
-                   EVERYONE_ALLOW_MASK, ALLOW, -1, ACE_EVERYONE);
-               aclp->z_acl_count += 6;
-       }
-
-       zfs_acl_fixup_canonical_six(aclp, mode);
+               zfs_set_ace(aclp, zacep, access_mask, type, who, iflags);
+               ace_size = aclp->z_ops.ace_size(acep);
+               zacep = (void *)((uintptr_t)zacep + ace_size);
+               new_count++;
+               new_bytes += ace_size;
+       }
+       zfs_set_ace(aclp, zacep, owner, 0, -1, ACE_OWNER);
+       zacep = (void *)((uintptr_t)zacep + abstract_size);
+       zfs_set_ace(aclp, zacep, group, 0, -1, OWNING_GROUP);
+       zacep = (void *)((uintptr_t)zacep + abstract_size);
+       zfs_set_ace(aclp, zacep, everyone, 0, -1, ACE_EVERYONE);
+
+       new_count += 3;
+       new_bytes += abstract_size * 3;
+       zfs_acl_release_nodes(aclp);
+       aclp->z_acl_count = new_count;
+       aclp->z_acl_bytes = new_bytes;
+       newnode->z_ace_count = new_count;
+       newnode->z_size = new_bytes;
+       list_insert_tail(&aclp->z_acl, newnode);
 }
 
 int
 zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
 {
-       int error;
-
        mutex_enter(&zp->z_lock);
        mutex_enter(&zp->z_acl_lock);
-       *aclp = NULL;
-       error = zfs_acl_node_read(zp, aclp, B_TRUE);
-       if (error == 0) {
-               (*aclp)->z_hints = zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS;
-               zfs_acl_chmod(zp->z_zfsvfs, zp->z_phys->zp_uid, mode, *aclp);
-       }
+       *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
+       (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
+       zfs_acl_chmod(zp->z_zfsvfs, mode, *aclp);
        mutex_exit(&zp->z_acl_lock);
        mutex_exit(&zp->z_lock);
-       return (error);
+       ASSERT(*aclp);
+       return (0);
 }
 
 /*
@@ -1693,8 +1535,8 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
     uint64_t mode, boolean_t *need_chmod)
 {
        void            *pacep;
-       void            *acep, *acep2;
-       zfs_acl_node_t  *aclnode, *aclnode2;
+       void            *acep;
+       zfs_acl_node_t  *aclnode;
        zfs_acl_t       *aclp = NULL;
        uint64_t        who;
        uint32_t        access_mask;
@@ -1716,7 +1558,7 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
        *need_chmod = B_TRUE;
        pacep = NULL;
        aclp = zfs_acl_alloc(paclp->z_version);
-       if (zfsvfs->z_acl_inherit == ZFS_ACL_DISCARD)
+       if (zfsvfs->z_acl_inherit == ZFS_ACL_DISCARD || vtype == VLNK)
                return (aclp);
        while (pacep = zfs_acl_next_ace(paclp, pacep, &who,
            &access_mask, &iflags, &type)) {
@@ -1745,11 +1587,11 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
                    OWNING_GROUP)) && (vreg || (vdir && (iflags &
                    ACE_DIRECTORY_INHERIT_ACE)))) {
                        *need_chmod = B_FALSE;
+               }
 
-                       if (!vdir && passthrough_x &&
-                           ((mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)) {
-                               access_mask &= ~ACE_EXECUTE;
-                       }
+               if (!vdir && passthrough_x &&
+                   ((mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)) {
+                       access_mask &= ~ACE_EXECUTE;
                }
 
                aclnode = zfs_acl_node_alloc(ace_size);
@@ -1767,6 +1609,7 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
                            &data2)) == data1sz);
                        bcopy(data1, data2, data2sz);
                }
+
                aclp->z_acl_count++;
                aclnode->z_ace_count++;
                aclp->z_acl_bytes += aclnode->z_size;
@@ -1785,38 +1628,17 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
 
                ASSERT(vdir);
 
-               newflags = aclp->z_ops.ace_flags_get(acep);
+               /*
+                * If only FILE_INHERIT is set then turn on
+                * inherit_only
+                */
                if ((iflags & (ACE_FILE_INHERIT_ACE |
-                   ACE_DIRECTORY_INHERIT_ACE)) !=
-                   ACE_FILE_INHERIT_ACE) {
-                       aclnode2 = zfs_acl_node_alloc(ace_size);
-                       list_insert_tail(&aclp->z_acl, aclnode2);
-                       acep2 = aclnode2->z_acldata;
-                       zfs_set_ace(aclp, acep2,
-                           access_mask, type, who,
-                           iflags|ACE_INHERITED_ACE);
+                   ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) {
                        newflags |= ACE_INHERIT_ONLY_ACE;
-                       aclp->z_ops.ace_flags_set(acep, newflags);
-                       newflags &= ~ALL_INHERIT;
-                       aclp->z_ops.ace_flags_set(acep2,
+                       aclp->z_ops.ace_flags_set(acep,
                            newflags|ACE_INHERITED_ACE);
-
-                       /*
-                        * Copy special opaque data if any
-                        */
-                       if ((data1sz = aclp->z_ops.ace_data(acep,
-                           &data1)) != 0) {
-                               VERIFY((data2sz =
-                                   aclp->z_ops.ace_data(acep2,
-                                   &data2)) == data1sz);
-                               bcopy(data1, data2, data1sz);
-                       }
-                       aclp->z_acl_count++;
-                       aclnode2->z_ace_count++;
-                       aclp->z_acl_bytes += aclnode->z_size;
-                       zfs_restricted_update(zfsvfs, aclp, acep2);
                } else {
-                       newflags |= ACE_INHERIT_ONLY_ACE;
+                       newflags &= ~ACE_INHERIT_ONLY_ACE;
                        aclp->z_ops.ace_flags_set(acep,
                            newflags|ACE_INHERITED_ACE);
                }
@@ -1837,6 +1659,8 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
        zfs_acl_t       *paclp;
        gid_t           gid;
        boolean_t       need_chmod = B_TRUE;
+       boolean_t       inherited = B_FALSE;
+       uint64_t        parentgid;
 
        bzero(acl_ids, sizeof (zfs_acl_ids_t));
        acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);
@@ -1845,11 +1669,10 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
                if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr,
                    &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0)
                        return (error);
-
        /*
         * Determine uid and gid.
         */
-       if ((flag & (IS_ROOT_NODE | IS_REPLAY)) ||
+       if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay ||
            ((flag & IS_XATTR) && (vap->va_type == VDIR))) {
                acl_ids->z_fuid = zfs_fuid_create(zfsvfs,
                    (uint64_t)vap->va_uid, cr,
@@ -1859,6 +1682,12 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
                    ZFS_GROUP, &acl_ids->z_fuidp);
                gid = vap->va_gid;
        } else {
+               if (IS_EPHEMERAL(dzp->z_gid))
+                       VERIFY(0 == sa_lookup(dzp->z_sa_hdl, SA_ZPL_GID(zfsvfs),
+                           &parentgid, sizeof (parentgid)));
+               else
+                       parentgid = (uint64_t)dzp->z_gid;
+
                acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER,
                    cr, &acl_ids->z_fuidp);
                acl_ids->z_fgid = 0;
@@ -1867,16 +1696,31 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
                            (uint64_t)vap->va_gid,
                            cr, ZFS_GROUP, &acl_ids->z_fuidp);
                        gid = vap->va_gid;
-                       if (acl_ids->z_fgid != dzp->z_phys->zp_gid &&
+                       if (acl_ids->z_fgid != parentgid &&
                            !groupmember(vap->va_gid, cr) &&
                            secpolicy_vnode_create_gid(cr) != 0)
                                acl_ids->z_fgid = 0;
                }
                if (acl_ids->z_fgid == 0) {
-                       if (dzp->z_phys->zp_mode & S_ISGID) {
-                               acl_ids->z_fgid = dzp->z_phys->zp_gid;
+                       if (dzp->z_mode & S_ISGID) {
+                               char            *domain;
+                               uint32_t        rid;
+
+                               acl_ids->z_fgid = parentgid;
                                gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid,
                                    cr, ZFS_GROUP);
+
+                               if (zfsvfs->z_use_fuids &&
+                                   IS_EPHEMERAL(acl_ids->z_fgid)) {
+                                       domain = zfs_fuid_idx_domain(
+                                           &zfsvfs->z_fuid_idx,
+                                           FUID_INDEX(acl_ids->z_fgid));
+                                       rid = FUID_RID(acl_ids->z_fgid);
+                                       zfs_fuid_node_add(&acl_ids->z_fuidp,
+                                           domain, rid,
+                                           FUID_INDEX(acl_ids->z_fgid),
+                                           acl_ids->z_fgid, ZFS_GROUP);
+                               }
                        } else {
                                acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs,
                                    ZFS_GROUP, cr, &acl_ids->z_fuidp);
@@ -1892,7 +1736,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
         * file's new group, clear the file's set-GID bit.
         */
 
-       if (!(flag & IS_ROOT_NODE) && (dzp->z_phys->zp_mode & S_ISGID) &&
+       if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) &&
            (vap->va_type == VDIR)) {
                acl_ids->z_mode |= S_ISGID;
        } else {
@@ -1904,26 +1748,35 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
        if (acl_ids->z_aclp == NULL) {
                mutex_enter(&dzp->z_lock);
                if (!(flag & IS_ROOT_NODE) && (ZTOV(dzp)->v_type == VDIR &&
-                   (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)) &&
-                   !(dzp->z_phys->zp_flags & ZFS_XATTR)) {
+                   (dzp->z_pflags & ZFS_INHERIT_ACE)) &&
+                   !(dzp->z_pflags & ZFS_XATTR)) {
                        mutex_enter(&dzp->z_acl_lock);
                        VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE));
-                       mutex_exit(&dzp->z_acl_lock);
                        acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
                            vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
+                       mutex_exit(&dzp->z_acl_lock);
+                       inherited = B_TRUE;
                } else {
                        acl_ids->z_aclp =
                            zfs_acl_alloc(zfs_acl_version_zp(dzp));
+                       acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
                }
                mutex_exit(&dzp->z_lock);
                if (need_chmod) {
-                       acl_ids->z_aclp->z_hints = (vap->va_type == VDIR) ?
+                       acl_ids->z_aclp->z_hints |= (vap->va_type == VDIR) ?
                            ZFS_ACL_AUTO_INHERIT : 0;
-                       zfs_acl_chmod(zfsvfs, acl_ids->z_fuid,
-                           acl_ids->z_mode, acl_ids->z_aclp);
+                       zfs_acl_chmod(zfsvfs, acl_ids->z_mode, acl_ids->z_aclp);
                }
        }
 
+       if (inherited || vsecp) {
+               acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode,
+                   acl_ids->z_aclp, &acl_ids->z_aclp->z_hints,
+                   acl_ids->z_fuid, acl_ids->z_fgid);
+               if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0)
+                       acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
+       }
+
        return (0);
 }
 
@@ -1944,8 +1797,8 @@ zfs_acl_ids_free(zfs_acl_ids_t *acl_ids)
 boolean_t
 zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, zfs_acl_ids_t *acl_ids)
 {
-       return (zfs_usergroup_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) ||
-           zfs_usergroup_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid));
+       return (zfs_fuid_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) ||
+           zfs_fuid_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid));
 }
 
 /*
@@ -1963,12 +1816,12 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
        mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT |
            VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES);
 
-       if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))
-               return (error);
-
        if (mask == 0)
                return (ENOSYS);
 
+       if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))
+               return (error);
+
        mutex_enter(&zp->z_acl_lock);
 
        error = zfs_acl_node_read(zp, &aclp, B_FALSE);
@@ -1980,8 +1833,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
        /*
         * Scan ACL to determine number of ACEs
         */
-       if ((zp->z_phys->zp_flags & ZFS_ACL_OBJ_ACE) &&
-           !(mask & VSA_ACE_ALLTYPES)) {
+       if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) {
                void *zacep = NULL;
                uint64_t who;
                uint32_t access_mask;
@@ -2002,7 +1854,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
                }
                vsecp->vsa_aclcnt = count;
        } else
-               count = aclp->z_acl_count;
+               count = (int)aclp->z_acl_count;
 
        if (mask & VSA_ACECNT) {
                vsecp->vsa_aclcnt = count;
@@ -2011,8 +1863,6 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
        if (mask & VSA_ACE) {
                size_t aclsz;
 
-               zfs_acl_node_t *aclnode = list_head(&aclp->z_acl);
-
                aclsz = count * sizeof (ace_t) +
                    sizeof (ace_object_t) * largeace;
 
@@ -2023,17 +1873,26 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
                        zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr,
                            vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES));
                else {
-                       bcopy(aclnode->z_acldata, vsecp->vsa_aclentp,
-                           count * sizeof (ace_t));
+                       zfs_acl_node_t *aclnode;
+                       void *start = vsecp->vsa_aclentp;
+
+                       for (aclnode = list_head(&aclp->z_acl); aclnode;
+                           aclnode = list_next(&aclp->z_acl, aclnode)) {
+                               bcopy(aclnode->z_acldata, start,
+                                   aclnode->z_size);
+                               start = (caddr_t)start + aclnode->z_size;
+                       }
+                       ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp ==
+                           aclp->z_acl_bytes);
                }
        }
        if (mask & VSA_ACE_ACLFLAGS) {
                vsecp->vsa_aclflags = 0;
-               if (zp->z_phys->zp_flags & ZFS_ACL_DEFAULTED)
+               if (zp->z_pflags & ZFS_ACL_DEFAULTED)
                        vsecp->vsa_aclflags |= ACL_DEFAULTED;
-               if (zp->z_phys->zp_flags & ZFS_ACL_PROTECTED)
+               if (zp->z_pflags & ZFS_ACL_PROTECTED)
                        vsecp->vsa_aclflags |= ACL_PROTECTED;
-               if (zp->z_phys->zp_flags & ZFS_ACL_AUTO_INHERIT)
+               if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT)
                        vsecp->vsa_aclflags |= ACL_AUTO_INHERIT;
        }
 
@@ -2115,7 +1974,7 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
        if (mask == 0)
                return (ENOSYS);
 
-       if (zp->z_phys->zp_flags & ZFS_IMMUTABLE)
+       if (zp->z_pflags & ZFS_IMMUTABLE)
                return (EPERM);
 
        if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))
@@ -2131,37 +1990,40 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
         * existing flags.
         */
        if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) {
-               aclp->z_hints |= (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS);
+               aclp->z_hints |=
+                   (zp->z_pflags & V4_ACL_WIDE_FLAGS);
        }
 top:
        mutex_enter(&zp->z_lock);
        mutex_enter(&zp->z_acl_lock);
 
        tx = dmu_tx_create(zfsvfs->z_os);
-       dmu_tx_hold_bonus(tx, zp->z_id);
-
-       if (zp->z_phys->zp_acl.z_acl_extern_obj) {
-               /* Are we upgrading ACL? */
-               if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
-                   zp->z_phys->zp_acl.z_acl_version ==
-                   ZFS_ACL_VERSION_INITIAL) {
-                       dmu_tx_hold_free(tx,
-                           zp->z_phys->zp_acl.z_acl_extern_obj,
-                           0, DMU_OBJECT_END);
-                       dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
-                           0, aclp->z_acl_bytes);
+
+       dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+       fuid_dirtied = zfsvfs->z_fuid_dirty;
+       if (fuid_dirtied)
+               zfs_fuid_txhold(zfsvfs, tx);
+
+       /*
+        * If old version and ACL won't fit in bonus and we aren't
+        * upgrading then take out necessary DMU holds
+        */
+
+       if (ZFS_EXTERNAL_ACL(zp)) {
+               if (zfsvfs->z_version <= ZPL_VERSION_SA &&
+                   ZNODE_ACL_VERSION(zp) <= ZFS_ACL_VERSION_INITIAL) {
+                       dmu_tx_hold_free(tx, ZFS_EXTERNAL_ACL(zp), 0,
+                           DMU_OBJECT_END);
                } else {
-                       dmu_tx_hold_write(tx,
-                           zp->z_phys->zp_acl.z_acl_extern_obj,
+                       dmu_tx_hold_write(tx, ZFS_EXTERNAL_ACL(zp),
                            0, aclp->z_acl_bytes);
                }
-       } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+       } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
                dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
        }
-       fuid_dirtied = zfsvfs->z_fuid_dirty;
-       if (fuid_dirtied)
-               zfs_fuid_txhold(zfsvfs, tx);
 
+       zfs_sa_upgrade_txholds(tx, zp);
        error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error) {
                mutex_exit(&zp->z_acl_lock);
@@ -2184,7 +2046,6 @@ top:
        if (fuid_dirtied)
                zfs_fuid_sync(zfsvfs, tx);
 
-       zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
        zfs_log_acl(zilog, tx, zp, vsecp, fuidp);
 
        if (fuidp)
@@ -2217,19 +2078,19 @@ zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
         */
        if ((v4_mode & WRITE_MASK_DATA) &&
            (((ZTOV(zp)->v_type != VDIR) &&
-           (zp->z_phys->zp_flags & (ZFS_READONLY | ZFS_IMMUTABLE))) ||
+           (zp->z_pflags & (ZFS_READONLY | ZFS_IMMUTABLE))) ||
            (ZTOV(zp)->v_type == VDIR &&
-           (zp->z_phys->zp_flags & ZFS_IMMUTABLE)))) {
+           (zp->z_pflags & ZFS_IMMUTABLE)))) {
                return (EPERM);
        }
 
        if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) &&
-           (zp->z_phys->zp_flags & ZFS_NOUNLINK)) {
+           (zp->z_pflags & ZFS_NOUNLINK)) {
                return (EPERM);
        }
 
        if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) &&
-           (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED))) {
+           (zp->z_pflags & ZFS_AV_QUARANTINED))) {
                return (EACCES);
        }
 
@@ -2276,10 +2137,7 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
        uint32_t        deny_mask = 0;
        zfs_ace_hdr_t   *acep = NULL;
        boolean_t       checkit;
-       uid_t           fowner;
-       uid_t           gowner;
-
-       zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
+       uint64_t        gowner;
 
        mutex_enter(&zp->z_acl_lock);
 
@@ -2289,6 +2147,14 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
                return (error);
        }
 
+       ASSERT(zp->z_acl_cached);
+
+       if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GID(zfsvfs),
+           &gowner, sizeof (gowner))) != 0) {
+               mutex_exit(&zp->z_acl_lock);
+               return (error);
+       }
+
        while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
            &iflags, &type)) {
                uint32_t mask_matched;
@@ -2310,7 +2176,7 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
 
                switch (entry_type) {
                case ACE_OWNER:
-                       if (uid == fowner)
+                       if (uid == zp->z_uid)
                                checkit = B_TRUE;
                        break;
                case OWNING_GROUP:
@@ -2388,18 +2254,8 @@ zfs_has_access(znode_t *zp, cred_t *cr)
        uint32_t have = ACE_ALL_PERMS;
 
        if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) {
-               uid_t           owner;
-
-               owner = zfs_fuid_map_id(zp->z_zfsvfs,
-                   zp->z_phys->zp_uid, cr, ZFS_OWNER);
-
-               return (
-                   secpolicy_vnode_access(cr, ZTOV(zp), owner, VREAD) == 0 ||
-                   secpolicy_vnode_access(cr, ZTOV(zp), owner, VWRITE) == 0 ||
-                   secpolicy_vnode_access(cr, ZTOV(zp), owner, VEXEC) == 0 ||
-                   secpolicy_vnode_chown(cr, owner) == 0 ||
-                   secpolicy_vnode_setdac(cr, owner) == 0 ||
-                   secpolicy_vnode_remove(cr) == 0);
+               return (secpolicy_vnode_any_access(cr, ZTOV(zp),
+                   zp->z_uid) == 0);
        }
        return (B_TRUE);
 }
@@ -2457,38 +2313,33 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
        boolean_t owner = B_FALSE;
        boolean_t groupmbr = B_FALSE;
        boolean_t is_attr;
-       uid_t fowner;
-       uid_t gowner;
        uid_t uid = crgetuid(cr);
        int error;
 
-       if (zdp->z_phys->zp_flags & ZFS_AV_QUARANTINED)
+       if (zdp->z_pflags & ZFS_AV_QUARANTINED)
                return (EACCES);
 
-       is_attr = ((zdp->z_phys->zp_flags & ZFS_XATTR) &&
+       is_attr = ((zdp->z_pflags & ZFS_XATTR) &&
            (ZTOV(zdp)->v_type == VDIR));
        if (is_attr)
                goto slow;
 
+
        mutex_enter(&zdp->z_acl_lock);
 
-       if (zdp->z_phys->zp_flags & ZFS_NO_EXECS_DENIED) {
+       if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) {
                mutex_exit(&zdp->z_acl_lock);
                return (0);
        }
 
-       if (FUID_INDEX(zdp->z_phys->zp_uid) != 0 ||
-           FUID_INDEX(zdp->z_phys->zp_gid) != 0) {
+       if (IS_EPHEMERAL(zdp->z_uid) != 0 || IS_EPHEMERAL(zdp->z_gid) != 0) {
                mutex_exit(&zdp->z_acl_lock);
                goto slow;
        }
 
-       fowner = (uid_t)zdp->z_phys->zp_uid;
-       gowner = (uid_t)zdp->z_phys->zp_gid;
-
-       if (uid == fowner) {
+       if (uid == zdp->z_uid) {
                owner = B_TRUE;
-               if (zdp->z_phys->zp_mode & S_IXUSR) {
+               if (zdp->z_mode & S_IXUSR) {
                        mutex_exit(&zdp->z_acl_lock);
                        return (0);
                } else {
@@ -2496,9 +2347,9 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
                        goto slow;
                }
        }
-       if (groupmember(gowner, cr)) {
+       if (groupmember(zdp->z_gid, cr)) {
                groupmbr = B_TRUE;
-               if (zdp->z_phys->zp_mode & S_IXGRP) {
+               if (zdp->z_mode & S_IXGRP) {
                        mutex_exit(&zdp->z_acl_lock);
                        return (0);
                } else {
@@ -2507,7 +2358,7 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
                }
        }
        if (!owner && !groupmbr) {
-               if (zdp->z_phys->zp_mode & S_IXOTH) {
+               if (zdp->z_mode & S_IXOTH) {
                        mutex_exit(&zdp->z_acl_lock);
                        return (0);
                }
@@ -2524,8 +2375,9 @@ slow:
 }
 
 /*
- * Determine whether Access should be granted/denied, invoking least
- * priv subsytem when a deny is determined.
+ * Determine whether Access should be granted/denied.
+ * The least priv subsystem is always consulted as a basic privilege
+ * can define any form of access.
  */
 int
 zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
@@ -2533,20 +2385,26 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
        uint32_t        working_mode;
        int             error;
        int             is_attr;
-       zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
        boolean_t       check_privs;
        znode_t         *xzp;
        znode_t         *check_zp = zp;
+       mode_t          needed_bits;
 
-       is_attr = ((zp->z_phys->zp_flags & ZFS_XATTR) &&
-           (ZTOV(zp)->v_type == VDIR));
+       is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR));
 
        /*
         * If attribute then validate against base file
         */
        if (is_attr) {
+               uint64_t        parent;
+
+               if ((error = sa_lookup(zp->z_sa_hdl,
+                   SA_ZPL_PARENT(zp->z_zfsvfs), &parent,
+                   sizeof (parent))) != 0)
+                       return (error);
+
                if ((error = zfs_zget(zp->z_zfsvfs,
-                   zp->z_phys->zp_parent, &xzp)) != 0) {
+                   parent, &xzp)) != 0)        {
                        return (error);
                }
 
@@ -2567,11 +2425,35 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
                }
        }
 
+       /*
+        * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC
+        * in needed_bits.  Map the bits mapped by working_mode (currently
+        * missing) in missing_bits.
+        * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode),
+        * needed_bits.
+        */
+       needed_bits = 0;
+
+       working_mode = mode;
+       if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+           zp->z_uid == crgetuid(cr))
+               working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
+
+       if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
+           ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
+               needed_bits |= VREAD;
+       if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
+           ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
+               needed_bits |= VWRITE;
+       if (working_mode & ACE_EXECUTE)
+               needed_bits |= VEXEC;
+
        if ((error = zfs_zaccess_common(check_zp, mode, &working_mode,
            &check_privs, skipaclchk, cr)) == 0) {
                if (is_attr)
                        VN_RELE(ZTOV(xzp));
-               return (0);
+               return (secpolicy_vnode_access2(cr, ZTOV(zp), zp->z_uid,
+                   needed_bits, needed_bits));
        }
 
        if (error && !check_privs) {
@@ -2585,12 +2467,8 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
        }
 
        if (error && check_privs) {
-               uid_t           owner;
                mode_t          checkmode = 0;
 
-               owner = zfs_fuid_map_id(zfsvfs, check_zp->z_phys->zp_uid, cr,
-                   ZFS_OWNER);
-
                /*
                 * First check for implicit owner permission on
                 * read_acl/read_attributes
@@ -2600,7 +2478,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
                ASSERT(working_mode != 0);
 
                if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) &&
-                   owner == crgetuid(cr)))
+                   zp->z_uid == crgetuid(cr)))
                        working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
 
                if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
@@ -2612,21 +2490,20 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
                if (working_mode & ACE_EXECUTE)
                        checkmode |= VEXEC;
 
-               if (checkmode)
-                       error = secpolicy_vnode_access(cr, ZTOV(check_zp),
-                           owner, checkmode);
+               error = secpolicy_vnode_access2(cr, ZTOV(check_zp), zp->z_uid,
+                   needed_bits & ~checkmode, needed_bits);
 
                if (error == 0 && (working_mode & ACE_WRITE_OWNER))
-                       error = secpolicy_vnode_chown(cr, owner);
+                       error = secpolicy_vnode_chown(cr, zp->z_uid);
                if (error == 0 && (working_mode & ACE_WRITE_ACL))
-                       error = secpolicy_vnode_setdac(cr, owner);
+                       error = secpolicy_vnode_setdac(cr, zp->z_uid);
 
                if (error == 0 && (working_mode &
                    (ACE_DELETE|ACE_DELETE_CHILD)))
                        error = secpolicy_vnode_remove(cr);
 
                if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) {
-                       error = secpolicy_vnode_chown(cr, owner);
+                       error = secpolicy_vnode_chown(cr, zp->z_uid);
                }
                if (error == 0) {
                        /*
@@ -2637,8 +2514,12 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
                                error = EACCES;
                        }
                }
+       } else if (error == 0) {
+               error = secpolicy_vnode_access2(cr, ZTOV(zp), zp->z_uid,
+                   needed_bits, needed_bits);
        }
 
+
        if (is_attr)
                VN_RELE(ZTOV(xzp));
 
@@ -2668,15 +2549,12 @@ zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr)
 
 static int
 zfs_delete_final_check(znode_t *zp, znode_t *dzp,
-    mode_t missing_perms, cred_t *cr)
+    mode_t available_perms, cred_t *cr)
 {
        int error;
-       uid_t downer;
-       zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-
-       downer = zfs_fuid_map_id(zfsvfs, dzp->z_phys->zp_uid, cr, ZFS_OWNER);
 
-       error = secpolicy_vnode_access(cr, ZTOV(dzp), downer, missing_perms);
+       error = secpolicy_vnode_access2(cr, ZTOV(dzp),
+           dzp->z_uid, available_perms, VWRITE|VEXEC);
 
        if (error == 0)
                error = zfs_sticky_remove_access(dzp, zp, cr);
@@ -2725,7 +2603,7 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
        uint32_t dzp_working_mode = 0;
        uint32_t zp_working_mode = 0;
        int dzp_error, zp_error;
-       mode_t missing_perms;
+       mode_t available_perms;
        boolean_t dzpcheck_privs = B_TRUE;
        boolean_t zpcheck_privs = B_TRUE;
 
@@ -2743,7 +2621,7 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
         * to determine what was found.
         */
 
-       if (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
+       if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
                return (EPERM);
 
        /*
@@ -2786,23 +2664,20 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
         * only need to see if we have write/execute on directory.
         */
 
-       if ((dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA,
-           &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0)
-               return (zfs_sticky_remove_access(dzp, zp, cr));
+       dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA,
+           &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr);
 
-       if (!dzpcheck_privs)
+       if (dzp_error != 0 && !dzpcheck_privs)
                return (dzp_error);
 
        /*
         * Fourth row
         */
 
-       missing_perms = (dzp_working_mode & ACE_WRITE_DATA) ? VWRITE : 0;
-       missing_perms |= (dzp_working_mode & ACE_EXECUTE) ? VEXEC : 0;
-
-       ASSERT(missing_perms);
+       available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : VWRITE;
+       available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : VEXEC;
 
-       return (zfs_delete_final_check(zp, dzp, missing_perms, cr));
+       return (zfs_delete_final_check(zp, dzp, available_perms, cr));
 
 }
 
@@ -2813,7 +2688,7 @@ zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
        int add_perm;
        int error;
 
-       if (szp->z_phys->zp_flags & ZFS_AV_QUARANTINED)
+       if (szp->z_pflags & ZFS_AV_QUARANTINED)
                return (EACCES);
 
        add_perm = (ZTOV(szp)->v_type == VDIR) ?
index cd36696..acf632b 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -27,6 +27,7 @@
 #include <sys/vfs.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
+#include <sys/zfs_sa.h>
 #include <sys/zfs_acl.h>
 
 void
index c6c7198..362de4d 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
@@ -186,6 +185,17 @@ zfsctl_fini(void)
        zfsctl_ops_shares_dir = NULL;
 }
 
+boolean_t
+zfsctl_is_node(vnode_t *vp)
+{
+       return (vn_matchops(vp, zfsctl_ops_root) ||
+           vn_matchops(vp, zfsctl_ops_snapdir) ||
+           vn_matchops(vp, zfsctl_ops_snapshot) ||
+           vn_matchops(vp, zfsctl_ops_shares) ||
+           vn_matchops(vp, zfsctl_ops_shares_dir));
+
+}
+
 /*
  * Return the inode number associated with the 'snapshot' or
  * 'shares' directory.
@@ -215,6 +225,7 @@ zfsctl_create(zfsvfs_t *zfsvfs)
 {
        vnode_t *vp, *rvp;
        zfsctl_node_t *zcp;
+       uint64_t crtime[2];
 
        ASSERT(zfsvfs->z_ctldir == NULL);
 
@@ -225,7 +236,9 @@ zfsctl_create(zfsvfs_t *zfsvfs)
        zcp->zc_id = ZFSCTL_INO_ROOT;
 
        VERIFY(VFS_ROOT(zfsvfs->z_vfs, &rvp) == 0);
-       ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime);
+       VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
+           &crtime, sizeof (crtime)));
+       ZFS_TIME_DECODE(&zcp->zc_cmtime, crtime);
        VN_RELE(rvp);
 
        /*
@@ -311,14 +324,13 @@ zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr,
 static void
 zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
 {
-       zfsctl_node_t   *zcp = vp->v_data;
        timestruc_t     now;
 
        vap->va_uid = 0;
        vap->va_gid = 0;
        vap->va_rdev = 0;
        /*
-        * We are a purly virtual object, so we have no
+        * We are a purely virtual object, so we have no
         * blocksize or allocated blocks.
         */
        vap->va_blksize = 0;
@@ -333,7 +345,6 @@ zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
         */
        gethrestime(&now);
        vap->va_atime = now;
-       vap->va_mtime = vap->va_ctime = zcp->zc_cmtime;
 }
 
 /*ARGSUSED*/
@@ -416,10 +427,12 @@ zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
     caller_context_t *ct)
 {
        zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+       zfsctl_node_t *zcp = vp->v_data;
 
        ZFS_ENTER(zfsvfs);
        vap->va_nodeid = ZFSCTL_INO_ROOT;
        vap->va_nlink = vap->va_size = NROOT_ENTRIES;
+       vap->va_mtime = vap->va_ctime = zcp->zc_cmtime;
 
        zfsctl_common_getattr(vp, vap);
        ZFS_EXIT(zfsvfs);
@@ -853,8 +866,7 @@ zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
                 */
                return (err == EILSEQ ? ENOENT : err);
        }
-       if (dmu_objset_open(snapname, DMU_OST_ZFS,
-           DS_MODE_USER | DS_MODE_READONLY, &snap) != 0) {
+       if (dmu_objset_hold(snapname, FTAG, &snap) != 0) {
                mutex_exit(&sdp->sd_lock);
                ZFS_EXIT(zfsvfs);
                return (ENOENT);
@@ -866,7 +878,7 @@ zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
        *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
        avl_insert(&sdp->sd_snaps, sep, where);
 
-       dmu_objset_close(snap);
+       dmu_objset_rele(snap, FTAG);
 domount:
        mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) +
            strlen("/.zfs/snapshot/") + strlen(nm) + 1;
@@ -1102,6 +1114,7 @@ zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
        zfsctl_common_getattr(vp, vap);
        vap->va_nodeid = gfs_file_inode(vp);
        vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
+       vap->va_ctime = vap->va_mtime = dmu_objset_snap_cmtime(zfsvfs->z_os);
        ZFS_EXIT(zfsvfs);
 
        return (0);
diff --git a/module/zfs/zfs_debug.c b/module/zfs/zfs_debug.c
new file mode 100644 (file)
index 0000000..d0f411a
--- /dev/null
@@ -0,0 +1,95 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+
+list_t zfs_dbgmsgs;
+int zfs_dbgmsg_size;
+kmutex_t zfs_dbgmsgs_lock;
+int zfs_dbgmsg_maxsize = 1<<20; /* 1MB */
+
+void
+zfs_dbgmsg_init(void)
+{
+       list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t),
+           offsetof(zfs_dbgmsg_t, zdm_node));
+       mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+zfs_dbgmsg_fini(void)
+{
+       zfs_dbgmsg_t *zdm;
+
+       while ((zdm = list_remove_head(&zfs_dbgmsgs)) != NULL) {
+               int size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg);
+               kmem_free(zdm, size);
+               zfs_dbgmsg_size -= size;
+       }
+       mutex_destroy(&zfs_dbgmsgs_lock);
+       ASSERT3U(zfs_dbgmsg_size, ==, 0);
+}
+
+/*
+ * Print these messages by running:
+ *     echo ::zfs_dbgmsg | mdb -k
+ *
+ * Monitor these messages by running:
+ *     dtrace -q -n 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}'
+ */
+void
+zfs_dbgmsg(const char *fmt, ...)
+{
+       int size;
+       va_list adx;
+       zfs_dbgmsg_t *zdm;
+
+       va_start(adx, fmt);
+       size = vsnprintf(NULL, 0, fmt, adx);
+       va_end(adx);
+
+       /*
+        * There is one byte of string in sizeof (zfs_dbgmsg_t), used
+        * for the terminating null.
+        */
+       zdm = kmem_alloc(sizeof (zfs_dbgmsg_t) + size, KM_SLEEP);
+       zdm->zdm_timestamp = gethrestime_sec();
+
+       va_start(adx, fmt);
+       (void) vsnprintf(zdm->zdm_msg, size + 1, fmt, adx);
+       va_end(adx);
+
+       DTRACE_PROBE1(zfs__dbgmsg, char *, zdm->zdm_msg);
+
+       mutex_enter(&zfs_dbgmsgs_lock);
+       list_insert_tail(&zfs_dbgmsgs, zdm);
+       zfs_dbgmsg_size += sizeof (zfs_dbgmsg_t) + size;
+       while (zfs_dbgmsg_size > zfs_dbgmsg_maxsize) {
+               zdm = list_remove_head(&zfs_dbgmsgs);
+               size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg);
+               kmem_free(zdm, size);
+               zfs_dbgmsg_size -= size;
+       }
+       mutex_exit(&zfs_dbgmsgs_lock);
+}
index b3f7683..6d66668 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -52,6 +51,8 @@
 #include <sys/atomic.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
 #include <sys/dnlc.h>
 #include <sys/extdirent.h>
 
@@ -114,6 +115,8 @@ zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, boolean_t exact,
  *               ZCIEXACT: On a purely case-insensitive file system,
  *                         this lookup should be case-sensitive.
  *               ZRENAMING: we are locking for renaming, force narrow locks
+ *               ZHAVELOCK: Don't grab the z_name_lock for this call. The
+ *                          current thread already holds it.
  *
  * Output arguments:
  *     zpp     - pointer to the znode for the entry (NULL if there isn't one)
@@ -208,13 +211,20 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
 
        /*
         * Wait until there are no locks on this name.
+        *
+        * Don't grab the lock if it is already held. However, cannot
+        * have both ZSHARED and ZHAVELOCK together.
         */
-       rw_enter(&dzp->z_name_lock, RW_READER);
+       ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK));
+       if (!(flag & ZHAVELOCK))
+               rw_enter(&dzp->z_name_lock, RW_READER);
+
        mutex_enter(&dzp->z_lock);
        for (;;) {
                if (dzp->z_unlinked) {
                        mutex_exit(&dzp->z_lock);
-                       rw_exit(&dzp->z_name_lock);
+                       if (!(flag & ZHAVELOCK))
+                               rw_exit(&dzp->z_name_lock);
                        return (ENOENT);
                }
                for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
@@ -224,7 +234,8 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
                }
                if (error != 0) {
                        mutex_exit(&dzp->z_lock);
-                       rw_exit(&dzp->z_name_lock);
+                       if (!(flag & ZHAVELOCK))
+                               rw_exit(&dzp->z_name_lock);
                        return (ENOENT);
                }
                if (dl == NULL) {
@@ -235,6 +246,7 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
                        cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
                        dl->dl_name = name;
                        dl->dl_sharecnt = 0;
+                       dl->dl_namelock = 0;
                        dl->dl_namesize = 0;
                        dl->dl_dzp = dzp;
                        dl->dl_next = dzp->z_dirlocks;
@@ -246,6 +258,12 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
                cv_wait(&dl->dl_cv, &dzp->z_lock);
        }
 
+       /*
+        * If the caller already held z_name_lock (ZHAVELOCK), record that in the dirlock.
+        */
+       if (flag & ZHAVELOCK)
+               dl->dl_namelock = 1;
+
        if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
                /*
                 * We're the second shared reference to dl.  Make a copy of
@@ -269,8 +287,10 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
         * See if there's an object by this name; if so, put a hold on it.
         */
        if (flag & ZXATTR) {
-               zoid = dzp->z_phys->zp_xattr;
-               error = (zoid == 0 ? ENOENT : 0);
+               error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
+                   sizeof (zoid));
+               if (error == 0)
+                       error = (zoid == 0 ? ENOENT : 0);
        } else {
                if (update)
                        vp = dnlc_lookup(ZTOV(dzp), name);
@@ -325,7 +345,10 @@ zfs_dirent_unlock(zfs_dirlock_t *dl)
        zfs_dirlock_t **prev_dl, *cur_dl;
 
        mutex_enter(&dzp->z_lock);
-       rw_exit(&dzp->z_name_lock);
+
+       if (!dl->dl_namelock)
+               rw_exit(&dzp->z_name_lock);
+
        if (dl->dl_sharecnt > 1) {
                dl->dl_sharecnt--;
                mutex_exit(&dzp->z_lock);
@@ -359,25 +382,29 @@ zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags,
        zfs_dirlock_t *dl;
        znode_t *zp;
        int error = 0;
+       uint64_t parent;
 
        if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
                *vpp = ZTOV(dzp);
                VN_HOLD(*vpp);
        } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
                zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+
                /*
                 * If we are a snapshot mounted under .zfs, return
                 * the vp for the snapshot directory.
                 */
-               if (dzp->z_phys->zp_parent == dzp->z_id &&
-                   zfsvfs->z_parent != zfsvfs) {
+               if ((error = sa_lookup(dzp->z_sa_hdl,
+                   SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+                       return (error);
+               if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) {
                        error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
                            "snapshot", vpp, NULL, 0, NULL, kcred,
                            NULL, NULL, NULL);
                        return (error);
                }
                rw_enter(&dzp->z_parent_lock, RW_READER);
-               error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp);
+               error = zfs_zget(zfsvfs, parent, &zp);
                if (error == 0)
                        *vpp = ZTOV(zp);
                rw_exit(&dzp->z_parent_lock);
@@ -425,7 +452,7 @@ zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 
        ASSERT(zp->z_unlinked);
-       ASSERT3U(zp->z_phys->zp_links, ==, 0);
+       ASSERT(zp->z_links == 0);
 
        VERIFY3U(0, ==,
            zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
@@ -520,10 +547,12 @@ zfs_purgedir(znode_t *dzp)
                    (ZTOV(xzp)->v_type == VLNK));
 
                tx = dmu_tx_create(zfsvfs->z_os);
-               dmu_tx_hold_bonus(tx, dzp->z_id);
+               dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
                dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
-               dmu_tx_hold_bonus(tx, xzp->z_id);
+               dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
                dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+               /* Is this really needed ? */
+               zfs_sa_upgrade_txholds(tx, xzp);
                error = dmu_tx_assign(tx, TXG_WAIT);
                if (error) {
                        dmu_tx_abort(tx);
@@ -556,15 +585,16 @@ zfs_rmnode(znode_t *zp)
        znode_t         *xzp = NULL;
        dmu_tx_t        *tx;
        uint64_t        acl_obj;
+       uint64_t        xattr_obj;
        int             error;
 
+       ASSERT(zp->z_links == 0);
        ASSERT(ZTOV(zp)->v_count == 0);
-       ASSERT(zp->z_phys->zp_links == 0);
 
        /*
         * If this is an attribute directory, purge its contents.
         */
-       if (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR)) {
+       if (ZTOV(zp)->v_type == VDIR && (zp->z_pflags & ZFS_XATTR)) {
                if (zfs_purgedir(zp) != 0) {
                        /*
                         * Not enough space to delete some xattrs.
@@ -593,12 +623,14 @@ zfs_rmnode(znode_t *zp)
         * If the file has extended attributes, we're going to unlink
         * the xattr dir.
         */
-       if (zp->z_phys->zp_xattr) {
-               error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
+       error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+           &xattr_obj, sizeof (xattr_obj));
+       if (error == 0 && xattr_obj) {
+               error = zfs_zget(zfsvfs, xattr_obj, &xzp);
                ASSERT(error == 0);
        }
 
-       acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
+       acl_obj = ZFS_EXTERNAL_ACL(zp);
 
        /*
         * Set up the final transaction.
@@ -607,11 +639,13 @@ zfs_rmnode(znode_t *zp)
        dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
        dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
        if (xzp) {
-               dmu_tx_hold_bonus(tx, xzp->z_id);
                dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
+               dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
        }
        if (acl_obj)
                dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+
+       zfs_sa_upgrade_txholds(tx, zp);
        error = dmu_tx_assign(tx, TXG_WAIT);
        if (error) {
                /*
@@ -626,10 +660,12 @@ zfs_rmnode(znode_t *zp)
        }
 
        if (xzp) {
-               dmu_buf_will_dirty(xzp->z_dbuf, tx);
+               ASSERT(error == 0);
                mutex_enter(&xzp->z_lock);
                xzp->z_unlinked = B_TRUE;       /* mark xzp for deletion */
-               xzp->z_phys->zp_links = 0;      /* no more links to it */
+               xzp->z_links = 0;       /* no more links to it */
+               VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+                   &xzp->z_links, sizeof (xzp->z_links), tx));
                mutex_exit(&xzp->z_lock);
                zfs_unlinked_add(xzp, tx);
        }
@@ -647,11 +683,12 @@ out:
 }
 
 static uint64_t
-zfs_dirent(znode_t *zp)
+zfs_dirent(znode_t *zp, uint64_t mode)
 {
        uint64_t de = zp->z_id;
+
        if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE)
-               de |= IFTODT((zp)->z_phys->zp_mode) << 60;
+               de |= IFTODT(mode) << 60;
        return (de);
 }
 
@@ -662,12 +699,15 @@ int
 zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
 {
        znode_t *dzp = dl->dl_dzp;
+       zfsvfs_t *zfsvfs = zp->z_zfsvfs;
        vnode_t *vp = ZTOV(zp);
        uint64_t value;
        int zp_is_dir = (vp->v_type == VDIR);
+       sa_bulk_attr_t bulk[5];
+       uint64_t mtime[2], ctime[2];
+       int count = 0;
        int error;
 
-       dmu_buf_will_dirty(zp->z_dbuf, tx);
        mutex_enter(&zp->z_lock);
 
        if (!(flag & ZRENAMING)) {
@@ -676,22 +716,47 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
                        mutex_exit(&zp->z_lock);
                        return (ENOENT);
                }
-               zp->z_phys->zp_links++;
+               zp->z_links++;
+               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+                   &zp->z_links, sizeof (zp->z_links));
+
        }
-       zp->z_phys->zp_parent = dzp->z_id;      /* dzp is now zp's parent */
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+           &dzp->z_id, sizeof (dzp->z_id));
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+           &zp->z_pflags, sizeof (zp->z_pflags));
+
+       if (!(flag & ZNEW)) {
+               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+                   ctime, sizeof (ctime));
+               zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+                   ctime, B_TRUE);
+       }
+       error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+       ASSERT(error == 0);
 
-       if (!(flag & ZNEW))
-               zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
        mutex_exit(&zp->z_lock);
 
-       dmu_buf_will_dirty(dzp->z_dbuf, tx);
        mutex_enter(&dzp->z_lock);
-       dzp->z_phys->zp_size++;                 /* one dirent added */
-       dzp->z_phys->zp_links += zp_is_dir;     /* ".." link from zp */
-       zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
+       dzp->z_size++;
+       dzp->z_links += zp_is_dir;
+       count = 0;
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+           &dzp->z_size, sizeof (dzp->z_size));
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+           &dzp->z_links, sizeof (dzp->z_links));
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+           mtime, sizeof (mtime));
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+           ctime, sizeof (ctime));
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+           &dzp->z_pflags, sizeof (dzp->z_pflags));
+       zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
+       error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+       ASSERT(error == 0);
        mutex_exit(&dzp->z_lock);
 
-       value = zfs_dirent(zp);
+       value = zfs_dirent(zp, zp->z_mode);
        error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name,
            8, 1, &value, tx);
        ASSERT(error == 0);
@@ -701,6 +766,30 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
        return (0);
 }
 
+static int
+zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx,
+    int flag)
+{
+       int error;
+
+       if (zp->z_zfsvfs->z_norm) {
+               if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) &&
+                   (flag & ZCIEXACT)) ||
+                   ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) &&
+                   !(flag & ZCILOOK)))
+                       error = zap_remove_norm(zp->z_zfsvfs->z_os,
+                           dzp->z_id, dl->dl_name, MT_EXACT, tx);
+               else
+                       error = zap_remove_norm(zp->z_zfsvfs->z_os,
+                           dzp->z_id, dl->dl_name, MT_FIRST, tx);
+       } else {
+               error = zap_remove(zp->z_zfsvfs->z_os,
+                   dzp->z_id, dl->dl_name, tx);
+       }
+
+       return (error);
+}
+
 /*
  * Unlink zp from dl, and mark zp for deletion if this was the last link.
  * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
@@ -713,16 +802,18 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
        boolean_t *unlinkedp)
 {
        znode_t *dzp = dl->dl_dzp;
+       zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
        vnode_t *vp = ZTOV(zp);
        int zp_is_dir = (vp->v_type == VDIR);
        boolean_t unlinked = B_FALSE;
+       sa_bulk_attr_t bulk[5];
+       uint64_t mtime[2], ctime[2];
+       int count = 0;
        int error;
 
        dnlc_remove(ZTOV(dzp), dl->dl_name);
 
        if (!(flag & ZRENAMING)) {
-               dmu_buf_will_dirty(zp->z_dbuf, tx);
-
                if (vn_vfswlock(vp))            /* prevent new mounts on zp */
                        return (EBUSY);
 
@@ -732,52 +823,75 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
                }
 
                mutex_enter(&zp->z_lock);
-               if (zp_is_dir && !zfs_dirempty(zp)) {   /* dir not empty */
+
+               if (zp_is_dir && !zfs_dirempty(zp)) {
                        mutex_exit(&zp->z_lock);
                        vn_vfsunlock(vp);
                        return (EEXIST);
                }
-               if (zp->z_phys->zp_links <= zp_is_dir) {
+
+               /*
+                * If we get here, we are going to try to remove the object.
+                * First try removing the name from the directory; if that
+                * fails, return the error.
+                */
+               error = zfs_dropname(dl, zp, dzp, tx, flag);
+               if (error != 0) {
+                       mutex_exit(&zp->z_lock);
+                       vn_vfsunlock(vp);
+                       return (error);
+               }
+
+               if (zp->z_links <= zp_is_dir) {
                        zfs_panic_recover("zfs: link count on %s is %u, "
                            "should be at least %u",
                            zp->z_vnode->v_path ? zp->z_vnode->v_path :
-                           "<unknown>", (int)zp->z_phys->zp_links,
+                           "<unknown>", (int)zp->z_links,
                            zp_is_dir + 1);
-                       zp->z_phys->zp_links = zp_is_dir + 1;
+                       zp->z_links = zp_is_dir + 1;
                }
-               if (--zp->z_phys->zp_links == zp_is_dir) {
+               if (--zp->z_links == zp_is_dir) {
                        zp->z_unlinked = B_TRUE;
-                       zp->z_phys->zp_links = 0;
+                       zp->z_links = 0;
                        unlinked = B_TRUE;
                } else {
-                       zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+                       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+                           NULL, &ctime, sizeof (ctime));
+                       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+                           NULL, &zp->z_pflags, sizeof (zp->z_pflags));
+                       zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
+                           B_TRUE);
                }
+               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+                   NULL, &zp->z_links, sizeof (zp->z_links));
+               error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+               count = 0;
+               ASSERT(error == 0);
                mutex_exit(&zp->z_lock);
                vn_vfsunlock(vp);
+       } else {
+               error = zfs_dropname(dl, zp, dzp, tx, flag);
+               if (error != 0)
+                       return (error);
        }
 
-       dmu_buf_will_dirty(dzp->z_dbuf, tx);
        mutex_enter(&dzp->z_lock);
-       dzp->z_phys->zp_size--;                 /* one dirent removed */
-       dzp->z_phys->zp_links -= zp_is_dir;     /* ".." link from zp */
-       zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
-       mutex_exit(&dzp->z_lock);
-
-       if (zp->z_zfsvfs->z_norm) {
-               if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) &&
-                   (flag & ZCIEXACT)) ||
-                   ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) &&
-                   !(flag & ZCILOOK)))
-                       error = zap_remove_norm(zp->z_zfsvfs->z_os,
-                           dzp->z_id, dl->dl_name, MT_EXACT, tx);
-               else
-                       error = zap_remove_norm(zp->z_zfsvfs->z_os,
-                           dzp->z_id, dl->dl_name, MT_FIRST, tx);
-       } else {
-               error = zap_remove(zp->z_zfsvfs->z_os,
-                   dzp->z_id, dl->dl_name, tx);
-       }
+       dzp->z_size--;          /* one dirent removed */
+       dzp->z_links -= zp_is_dir;      /* ".." link from zp */
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+           NULL, &dzp->z_links, sizeof (dzp->z_links));
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+           NULL, &dzp->z_size, sizeof (dzp->z_size));
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+           NULL, ctime, sizeof (ctime));
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+           NULL, mtime, sizeof (mtime));
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+           NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
+       zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
+       error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
        ASSERT(error == 0);
+       mutex_exit(&dzp->z_lock);
 
        if (unlinkedp != NULL)
                *unlinkedp = unlinked;
@@ -795,7 +909,7 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
 boolean_t
 zfs_dirempty(znode_t *dzp)
 {
-       return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0);
+       return (dzp->z_size == 2 && dzp->z_dirlocks == 0);
 }
 
 int
@@ -807,6 +921,7 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
        int error;
        zfs_acl_ids_t acl_ids;
        boolean_t fuid_dirtied;
+       uint64_t parent;
 
        *xvpp = NULL;
 
@@ -821,28 +936,39 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
                return (EDQUOT);
        }
 
+top:
        tx = dmu_tx_create(zfsvfs->z_os);
-       dmu_tx_hold_bonus(tx, zp->z_id);
+       dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+           ZFS_SA_BASE_ATTR_SIZE);
+       dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
        dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
        fuid_dirtied = zfsvfs->z_fuid_dirty;
        if (fuid_dirtied)
                zfs_fuid_txhold(zfsvfs, tx);
        error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error) {
-               zfs_acl_ids_free(&acl_ids);
-               if (error == ERESTART)
+               if (error == ERESTART) {
                        dmu_tx_wait(tx);
+                       dmu_tx_abort(tx);
+                       goto top;
+               }
+               zfs_acl_ids_free(&acl_ids);
                dmu_tx_abort(tx);
                return (error);
        }
-       zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, &acl_ids);
+       zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);
 
        if (fuid_dirtied)
                zfs_fuid_sync(zfsvfs, tx);
 
-       ASSERT(xzp->z_phys->zp_parent == zp->z_id);
-       dmu_buf_will_dirty(zp->z_dbuf, tx);
-       zp->z_phys->zp_xattr = xzp->z_id;
+#ifdef DEBUG
+       error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+           &parent, sizeof (parent));
+       ASSERT(error == 0 && parent == zp->z_id);
+#endif
+
+       VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
+           sizeof (xzp->z_id), tx));
 
        (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
            xzp, "", NULL, acl_ids.z_fuidp, vap);
@@ -887,7 +1013,6 @@ top:
                return (0);
        }
 
-       ASSERT(zp->z_phys->zp_xattr == 0);
 
        if (!(flags & CREATE_XATTR_DIR)) {
                zfs_dirent_unlock(dl);
@@ -942,20 +1067,14 @@ int
 zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
 {
        uid_t           uid;
-       uid_t           downer;
-       uid_t           fowner;
-       zfsvfs_t        *zfsvfs = zdp->z_zfsvfs;
 
        if (zdp->z_zfsvfs->z_replay)
                return (0);
 
-       if ((zdp->z_phys->zp_mode & S_ISVTX) == 0)
+       if ((zdp->z_mode & S_ISVTX) == 0)
                return (0);
 
-       downer = zfs_fuid_map_id(zfsvfs, zdp->z_phys->zp_uid, cr, ZFS_OWNER);
-       fowner = zfs_fuid_map_id(zfsvfs, zp->z_phys->zp_uid, cr, ZFS_OWNER);
-
-       if ((uid = crgetuid(cr)) == downer || uid == fowner ||
+       if ((uid = crgetuid(cr)) == zdp->z_uid || uid == zp->z_uid ||
            (ZTOV(zp)->v_type == VREG &&
            zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0))
                return (0);
index 8b7785f..0b48126 100644 (file)
@@ -28,6 +28,7 @@
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
+#include <sys/zio_checksum.h>
 
 #include <sys/fm/fs/zfs.h>
 #include <sys/fm/protocol.h>
  * this pointer is set to NULL, and no ereport will be generated (since it
  * doesn't actually correspond to any particular device or piece of data,
  * and the caller will always retry without caching or queueing anyway).
+ *
+ * For checksum errors, we want to include more information about the actual
+ * error which occurs.  Accordingly, we build an ereport when the error is
+ * noticed, but instead of sending it in immediately, we hang it off of the
+ * io_cksum_report field of the logical IO.  When the logical IO completes
+ * (successfully or not), zfs_ereport_finish_checksum() is called with the
+ * good and bad versions of the buffer (if available), and we annotate the
+ * ereport with information about the differences.
  */
-void
-zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
+#ifdef _KERNEL
+static void
+zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
+    const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
     uint64_t stateoroffset, uint64_t size)
 {
-#ifdef _KERNEL
        nvlist_t *ereport, *detector;
+
        uint64_t ena;
        char class[64];
 
        /*
-        * If we are doing a spa_tryimport(), ignore errors.
+        * If we are doing a spa_tryimport() or in recovery mode,
+        * ignore errors.
         */
-       if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
+       if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
+           spa_load_state(spa) == SPA_LOAD_RECOVER)
                return;
 
        /*
@@ -108,7 +121,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
         * failed, don't bother logging any new ereports - we're just going to
         * get the same diagnosis anyway.
         */
-       if (spa->spa_load_state != SPA_LOAD_NONE &&
+       if (spa_load_state(spa) != SPA_LOAD_NONE &&
            spa->spa_last_open_failed)
                return;
 
@@ -147,9 +160,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
                         * not yet been asynchronously placed into the REMOVED
                         * state.
                         */
-                       if (zio->io_vd == vd &&
-                           !vdev_accessible(vd, zio) &&
-                           strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0)
+                       if (zio->io_vd == vd && !vdev_accessible(vd, zio))
                                return;
 
                        /*
@@ -164,6 +175,15 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
                }
        }
 
+       /*
+        * For probe failure, we want to avoid posting ereports if we've
+        * already removed the device in the meantime.
+        */
+       if (vd != NULL &&
+           strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
+           (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
+               return;
+
        if ((ereport = fm_nvlist_create(NULL)) == NULL)
                return;
 
@@ -182,7 +202,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
         * state, use a SPA-wide ENA.  Otherwise, if we are in an I/O state, use
         * a root zio-wide ENA.  Otherwise, simply use a unique ENA.
         */
-       if (spa->spa_load_state != SPA_LOAD_NONE) {
+       if (spa_load_state(spa) != SPA_LOAD_NONE) {
                if (spa->spa_ena == 0)
                        spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
                ena = spa->spa_ena;
@@ -218,7 +238,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
            DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
            DATA_TYPE_UINT64, spa_guid(spa),
            FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
-           spa->spa_load_state, NULL);
+           spa_load_state(spa), NULL);
 
        if (spa != NULL) {
                fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
@@ -322,8 +342,339 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
                    FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
                    DATA_TYPE_UINT64, stateoroffset, NULL);
        }
+
        mutex_exit(&spa->spa_errlist_lock);
 
+       *ereport_out = ereport;
+       *detector_out = detector;
+}
+
+/* if it's <= 128 bytes, save the corruption directly */
+#define        ZFM_MAX_INLINE          (128 / sizeof (uint64_t))
+
+#define        MAX_RANGES              16
+
+typedef struct zfs_ecksum_info {
+       /* histograms of set and cleared bits by bit number in a 64-bit word */
+       uint16_t zei_histogram_set[sizeof (uint64_t) * NBBY];
+       uint16_t zei_histogram_cleared[sizeof (uint64_t) * NBBY];
+
+       /* inline arrays of bits set and cleared. */
+       uint64_t zei_bits_set[ZFM_MAX_INLINE];
+       uint64_t zei_bits_cleared[ZFM_MAX_INLINE];
+
+       /*
+        * for each range, the number of bits set and cleared.  The Hamming
+        * distance between the good and bad buffers is the sum of them all.
+        */
+       uint32_t zei_range_sets[MAX_RANGES];
+       uint32_t zei_range_clears[MAX_RANGES];
+
+       struct zei_ranges {
+               uint32_t        zr_start;
+               uint32_t        zr_end;
+       } zei_ranges[MAX_RANGES];
+
+       size_t  zei_range_count;
+       uint32_t zei_mingap;
+       uint32_t zei_allowed_mingap;
+
+} zfs_ecksum_info_t;
+
+static void
+update_histogram(uint64_t value_arg, uint16_t *hist, uint32_t *count)
+{
+       size_t i;
+       size_t bits = 0;
+       uint64_t value = BE_64(value_arg);
+
+       /* We store the bits in big-endian (largest-first) order */
+       for (i = 0; i < 64; i++) {
+               if (value & (1ull << i)) {
+                       hist[63 - i]++;
+                       ++bits;
+               }
+       }
+       /* update the count of bits changed */
+       *count += bits;
+}
+
+/*
+ * We've now filled up the range array, and need to increase "mingap" and
+ * shrink the range list accordingly.  zei_mingap is always the smallest
+ * distance between array entries, so we set the new_allowed_gap to be
+ * one greater than that.  We then go through the list, joining together
+ * any ranges which are closer than the new_allowed_gap.
+ *
+ * By construction, there will be at least one.  We also update zei_mingap
+ * to the new smallest gap, to prepare for our next invocation.
+ */
+static void
+shrink_ranges(zfs_ecksum_info_t *eip)
+{
+       uint32_t mingap = UINT32_MAX;
+       uint32_t new_allowed_gap = eip->zei_mingap + 1;
+
+       size_t idx, output;
+       size_t max = eip->zei_range_count;
+
+       struct zei_ranges *r = eip->zei_ranges;
+
+       ASSERT3U(eip->zei_range_count, >, 0);
+       ASSERT3U(eip->zei_range_count, <=, MAX_RANGES);
+
+       output = idx = 0;
+       while (idx < max - 1) {
+               uint32_t start = r[idx].zr_start;
+               uint32_t end = r[idx].zr_end;
+
+               while (idx < max - 1) {
+                       idx++;
+
+                       uint32_t nstart = r[idx].zr_start;
+                       uint32_t nend = r[idx].zr_end;
+
+                       uint32_t gap = nstart - end;
+                       if (gap < new_allowed_gap) {
+                               end = nend;
+                               continue;
+                       }
+                       if (gap < mingap)
+                               mingap = gap;
+                       break;
+               }
+               r[output].zr_start = start;
+               r[output].zr_end = end;
+               output++;
+       }
+       ASSERT3U(output, <, eip->zei_range_count);
+       eip->zei_range_count = output;
+       eip->zei_mingap = mingap;
+       eip->zei_allowed_mingap = new_allowed_gap;
+}
+
+static void
+add_range(zfs_ecksum_info_t *eip, int start, int end)
+{
+       struct zei_ranges *r = eip->zei_ranges;
+       size_t count = eip->zei_range_count;
+
+       if (count >= MAX_RANGES) {
+               shrink_ranges(eip);
+               count = eip->zei_range_count;
+       }
+       if (count == 0) {
+               eip->zei_mingap = UINT32_MAX;
+               eip->zei_allowed_mingap = 1;
+       } else {
+               int gap = start - r[count - 1].zr_end;
+
+               if (gap < eip->zei_allowed_mingap) {
+                       r[count - 1].zr_end = end;
+                       return;
+               }
+               if (gap < eip->zei_mingap)
+                       eip->zei_mingap = gap;
+       }
+       r[count].zr_start = start;
+       r[count].zr_end = end;
+       eip->zei_range_count++;
+}
+
+static size_t
+range_total_size(zfs_ecksum_info_t *eip)
+{
+       struct zei_ranges *r = eip->zei_ranges;
+       size_t count = eip->zei_range_count;
+       size_t result = 0;
+       size_t idx;
+
+       for (idx = 0; idx < count; idx++)
+               result += (r[idx].zr_end - r[idx].zr_start);
+
+       return (result);
+}
+
+static zfs_ecksum_info_t *
+annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
+    const uint8_t *goodbuf, const uint8_t *badbuf, size_t size,
+    boolean_t drop_if_identical)
+{
+       const uint64_t *good = (const uint64_t *)goodbuf;
+       const uint64_t *bad = (const uint64_t *)badbuf;
+
+       uint64_t allset = 0;
+       uint64_t allcleared = 0;
+
+       size_t nui64s = size / sizeof (uint64_t);
+
+       size_t inline_size;
+       int no_inline = 0;
+       size_t idx;
+       size_t range;
+
+       size_t offset = 0;
+       ssize_t start = -1;
+
+       zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP);
+
+       /* don't do any annotation for injected checksum errors */
+       if (info != NULL && info->zbc_injected)
+               return (eip);
+
+       if (info != NULL && info->zbc_has_cksum) {
+               fm_payload_set(ereport,
+                   FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED,
+                   DATA_TYPE_UINT64_ARRAY,
+                   sizeof (info->zbc_expected) / sizeof (uint64_t),
+                   (uint64_t *)&info->zbc_expected,
+                   FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL,
+                   DATA_TYPE_UINT64_ARRAY,
+                   sizeof (info->zbc_actual) / sizeof (uint64_t),
+                   (uint64_t *)&info->zbc_actual,
+                   FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
+                   DATA_TYPE_STRING,
+                   info->zbc_checksum_name,
+                   NULL);
+
+               if (info->zbc_byteswapped) {
+                       fm_payload_set(ereport,
+                           FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP,
+                           DATA_TYPE_BOOLEAN, 1,
+                           NULL);
+               }
+       }
+
+       if (badbuf == NULL || goodbuf == NULL)
+               return (eip);
+
+       ASSERT3U(nui64s, <=, UINT16_MAX);
+       ASSERT3U(size, ==, nui64s * sizeof (uint64_t));
+       ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+       ASSERT3U(size, <=, UINT32_MAX);
+
+       /* build up the range list by comparing the two buffers. */
+       for (idx = 0; idx < nui64s; idx++) {
+               if (good[idx] == bad[idx]) {
+                       if (start == -1)
+                               continue;
+
+                       add_range(eip, start, idx);
+                       start = -1;
+               } else {
+                       if (start != -1)
+                               continue;
+
+                       start = idx;
+               }
+       }
+       if (start != -1)
+               add_range(eip, start, idx);
+
+       /* See if it will fit in our inline buffers */
+       inline_size = range_total_size(eip);
+       if (inline_size > ZFM_MAX_INLINE)
+               no_inline = 1;
+
+       /*
+        * If there is no change and we want to drop if the buffers are
+        * identical, do so.
+        */
+       if (inline_size == 0 && drop_if_identical) {
+               kmem_free(eip, sizeof (*eip));
+               return (NULL);
+       }
+
+       /*
+        * Now walk through the ranges, filling in the details of the
+        * differences.  Also convert our uint64_t-array offsets to byte
+        * offsets.
+        */
+       for (range = 0; range < eip->zei_range_count; range++) {
+               size_t start = eip->zei_ranges[range].zr_start;
+               size_t end = eip->zei_ranges[range].zr_end;
+
+               for (idx = start; idx < end; idx++) {
+                       uint64_t set, cleared;
+
+                       // bits set in bad, but not in good
+                       set = ((~good[idx]) & bad[idx]);
+                       // bits set in good, but not in bad
+                       cleared = (good[idx] & (~bad[idx]));
+
+                       allset |= set;
+                       allcleared |= cleared;
+
+                       if (!no_inline) {
+                               ASSERT3U(offset, <, inline_size);
+                               eip->zei_bits_set[offset] = set;
+                               eip->zei_bits_cleared[offset] = cleared;
+                               offset++;
+                       }
+
+                       update_histogram(set, eip->zei_histogram_set,
+                           &eip->zei_range_sets[range]);
+                       update_histogram(cleared, eip->zei_histogram_cleared,
+                           &eip->zei_range_clears[range]);
+               }
+
+               /* convert to byte offsets */
+               eip->zei_ranges[range].zr_start *= sizeof (uint64_t);
+               eip->zei_ranges[range].zr_end   *= sizeof (uint64_t);
+       }
+       eip->zei_allowed_mingap *= sizeof (uint64_t);
+       inline_size             *= sizeof (uint64_t);
+
+       /* fill in ereport */
+       fm_payload_set(ereport,
+           FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES,
+           DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count,
+           (uint32_t *)eip->zei_ranges,
+           FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP,
+           DATA_TYPE_UINT32, eip->zei_allowed_mingap,
+           FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS,
+           DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets,
+           FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS,
+           DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears,
+           NULL);
+
+       if (!no_inline) {
+               fm_payload_set(ereport,
+                   FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS,
+                   DATA_TYPE_UINT8_ARRAY,
+                   inline_size, (uint8_t *)eip->zei_bits_set,
+                   FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS,
+                   DATA_TYPE_UINT8_ARRAY,
+                   inline_size, (uint8_t *)eip->zei_bits_cleared,
+                   NULL);
+       } else {
+               fm_payload_set(ereport,
+                   FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM,
+                   DATA_TYPE_UINT16_ARRAY,
+                   NBBY * sizeof (uint64_t), eip->zei_histogram_set,
+                   FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM,
+                   DATA_TYPE_UINT16_ARRAY,
+                   NBBY * sizeof (uint64_t), eip->zei_histogram_cleared,
+                   NULL);
+       }
+       return (eip);
+}
+#endif
+
+void
+zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
+    uint64_t stateoroffset, uint64_t size)
+{
+#ifdef _KERNEL
+       nvlist_t *ereport = NULL;
+       nvlist_t *detector = NULL;
+
+       zfs_ereport_start(&ereport, &detector,
+           subclass, spa, vd, zio, stateoroffset, size);
+
+       if (ereport == NULL)
+               return;
+
        fm_ereport_post(ereport, EVCH_SLEEP);
 
        fm_nvlist_destroy(ereport, FM_NVA_FREE);
@@ -331,6 +682,122 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
 #endif
 }
 
+void
+zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
+    struct zio *zio, uint64_t offset, uint64_t length, void *arg,
+    zio_bad_cksum_t *info)
+{
+       zio_cksum_report_t *report = kmem_zalloc(sizeof (*report), KM_SLEEP);
+
+       if (zio->io_vsd != NULL)
+               zio->io_vsd_ops->vsd_cksum_report(zio, report, arg);
+       else
+               zio_vsd_default_cksum_report(zio, report, arg);
+
+       /* copy the checksum failure information if it was provided */
+       if (info != NULL) {
+               report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP);
+               bcopy(info, report->zcr_ckinfo, sizeof (*info));
+       }
+
+       report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift;
+       report->zcr_length = length;
+
+#ifdef _KERNEL
+       zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
+           FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);
+
+       if (report->zcr_ereport == NULL) {
+               report->zcr_free(report->zcr_cbdata, report->zcr_cbinfo);
+               kmem_free(report, sizeof (*report));
+               return;
+       }
+#endif
+
+       mutex_enter(&spa->spa_errlist_lock);
+       report->zcr_next = zio->io_logical->io_cksum_report;
+       zio->io_logical->io_cksum_report = report;
+       mutex_exit(&spa->spa_errlist_lock);
+}
+
+void
+zfs_ereport_finish_checksum(zio_cksum_report_t *report,
+    const void *good_data, const void *bad_data, boolean_t drop_if_identical)
+{
+#ifdef _KERNEL
+       zfs_ecksum_info_t *info = NULL;
+       info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo,
+           good_data, bad_data, report->zcr_length, drop_if_identical);
+
+       if (info != NULL)
+               fm_ereport_post(report->zcr_ereport, EVCH_SLEEP);
+
+       fm_nvlist_destroy(report->zcr_ereport, FM_NVA_FREE);
+       fm_nvlist_destroy(report->zcr_detector, FM_NVA_FREE);
+       report->zcr_ereport = report->zcr_detector = NULL;
+
+       if (info != NULL)
+               kmem_free(info, sizeof (*info));
+#endif
+}
+
+void
+zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
+{
+#ifdef _KERNEL
+       if (rpt->zcr_ereport != NULL) {
+               fm_nvlist_destroy(rpt->zcr_ereport,
+                   FM_NVA_FREE);
+               fm_nvlist_destroy(rpt->zcr_detector,
+                   FM_NVA_FREE);
+       }
+#endif
+       rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo);
+
+       if (rpt->zcr_ckinfo != NULL)
+               kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo));
+
+       kmem_free(rpt, sizeof (*rpt));
+}
+
+void
+zfs_ereport_send_interim_checksum(zio_cksum_report_t *report)
+{
+#ifdef _KERNEL
+       fm_ereport_post(report->zcr_ereport, EVCH_SLEEP);
+#endif
+}
+
+void
+zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
+    struct zio *zio, uint64_t offset, uint64_t length,
+    const void *good_data, const void *bad_data, zio_bad_cksum_t *zbc)
+{
+#ifdef _KERNEL
+       nvlist_t *ereport = NULL;
+       nvlist_t *detector = NULL;
+       zfs_ecksum_info_t *info;
+
+       zfs_ereport_start(&ereport, &detector,
+           FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);
+
+       if (ereport == NULL)
+               return;
+
+       info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
+           B_FALSE);
+
+       if (info != NULL)
+               fm_ereport_post(ereport, EVCH_SLEEP);
+
+       fm_nvlist_destroy(ereport, FM_NVA_FREE);
+       fm_nvlist_destroy(detector, FM_NVA_FREE);
+
+       if (info != NULL)
+               kmem_free(info, sizeof (*info));
+#endif
+}
+
 static void
 zfs_post_common(spa_t *spa, vdev_t *vd, const char *name)
 {
@@ -338,6 +805,9 @@ zfs_post_common(spa_t *spa, vdev_t *vd, const char *name)
        nvlist_t *resource;
        char class[64];
 
+       if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
+               return;
+
        if ((resource = fm_nvlist_create(NULL)) == NULL)
                return;
 
@@ -379,3 +849,15 @@ zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
 {
        zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE);
 }
+
+/*
+ * The 'resource.fs.zfs.statechange' event is an internal signal that the
+ * given vdev has transitioned its state to DEGRADED or HEALTHY.  This will
+ * cause the retire agent to repair any outstanding fault management cases
+ * open because the device was not found (fault.fs.zfs.device).
+ */
+void
+zfs_post_state_change(spa_t *spa, vdev_t *vd)
+{
+       zfs_post_common(spa, vd, FM_RESOURCE_STATECHANGE);
+}
index e704b1c..8c0424e 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
-#include <sys/sunddi.h>
 #include <sys/dmu.h>
 #include <sys/avl.h>
 #include <sys/zap.h>
@@ -377,7 +375,7 @@ zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
 
        rw_enter(&zfsvfs->z_fuid_lock, RW_READER);
 
-       if (zfsvfs->z_fuid_obj)
+       if (zfsvfs->z_fuid_obj || zfsvfs->z_fuid_dirty)
                domain = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx);
        else
                domain = nulldomain;
@@ -390,10 +388,26 @@ zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
 void
 zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp)
 {
-       *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_uid,
-           cr, ZFS_OWNER);
-       *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_gid,
-           cr, ZFS_GROUP);
+       uint64_t fuid, fgid;
+       sa_bulk_attr_t bulk[2];
+       int count = 0;
+
+       if (IS_EPHEMERAL(zp->z_uid) || IS_EPHEMERAL(zp->z_gid)) {
+               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zp->z_zfsvfs),
+                   NULL, &fuid, 8);
+               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zp->z_zfsvfs),
+                   NULL, &fgid, 8);
+               VERIFY(0 == sa_bulk_lookup(zp->z_sa_hdl, bulk, count));
+       }
+       if (IS_EPHEMERAL(zp->z_uid))
+               *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+       else
+               *uidp = zp->z_uid;
+       if (IS_EPHEMERAL(zp->z_gid))
+               *gidp = zfs_fuid_map_id(zp->z_zfsvfs,
+                   zp->z_gid, cr, ZFS_GROUP);
+       else
+               *gidp = zp->z_gid;
 }
 
 uid_t
@@ -427,7 +441,7 @@ zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid,
  * If ACL has multiple domains, then keep only one copy of each unique
  * domain.
  */
-static void
+void
 zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid,
     uint64_t idx, uint64_t id, zfs_fuid_type_t type)
 {
@@ -488,6 +502,11 @@ zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid,
 
 /*
  * Create a file system FUID, based on information in the users cred
+ *
+ * If cred contains KSID_OWNER then it should be used to determine
+ * the uid otherwise cred's uid will be used. By default cred's gid
+ * is used unless it's an ephemeral ID in which case KSID_GROUP will
+ * be used if it exists.
  */
 uint64_t
 zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
@@ -503,18 +522,27 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
        VERIFY(type == ZFS_OWNER || type == ZFS_GROUP);
 
        ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP);
-       if (ksid) {
-               id = ksid_getid(ksid);
-       } else {
-               if (type == ZFS_OWNER)
-                       id = crgetuid(cr);
-               else
-                       id = crgetgid(cr);
+
+       if (!zfsvfs->z_use_fuids || (ksid == NULL)) {
+               id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr);
+
+               if (IS_EPHEMERAL(id))
+                       return ((type == ZFS_OWNER) ? UID_NOBODY : GID_NOBODY);
+
+               return ((uint64_t)id);
        }
 
-       if (!zfsvfs->z_use_fuids || (!IS_EPHEMERAL(id)))
+       /*
+        * ksid is present and FUID is supported
+        */
+       id = (type == ZFS_OWNER) ? ksid_getid(ksid) : crgetgid(cr);
+
+       if (!IS_EPHEMERAL(id))
                return ((uint64_t)id);
 
+       if (type == ZFS_GROUP)
+               id = ksid_getid(ksid);
+
        rid = ksid_getrid(ksid);
        domain = ksid_getdomain(ksid);
 
index 9cb4081..de5fb1e 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/types.h>
 #include <sys/cmn_err.h>
 #include <sys/stat.h>
 #include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
 #include <sys/zfs_znode.h>
 #include <sys/zap.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev.h>
-#include <sys/vdev_impl.h>
+#include <sys/priv_impl.h>
 #include <sys/dmu.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_dir.h>
 #include <sys/zvol.h>
+#include <sys/dsl_scan.h>
 #include <sharefs/share.h>
 #include <sys/dmu_objset.h>
 
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
 #include "zfs_deleg.h"
+#include "zfs_comutil.h"
 
 extern struct modlfs zfs_modlfs;
 
@@ -102,17 +104,20 @@ static const char *userquota_perms[] = {
 };
 
 static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc);
-static void clear_props(char *dataset, nvlist_t *props, nvlist_t *newprops);
+static int zfs_check_settable(const char *name, nvpair_t *property,
+    cred_t *cr);
+static int zfs_check_clearable(char *dataset, nvlist_t *props,
+    nvlist_t **errors);
 static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *,
     boolean_t *);
-int zfs_set_prop_nvlist(const char *, nvlist_t *);
+int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t **);
 
 /* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */
 void
 __dprintf(const char *file, const char *func, int line, const char *fmt, ...)
 {
        const char *newfile;
-       char buf[256];
+       char buf[512];
        va_list adx;
 
        /*
@@ -175,22 +180,15 @@ history_str_get(zfs_cmd_t *zc)
 static boolean_t
 zfs_is_bootfs(const char *name)
 {
-       spa_t *spa;
-       boolean_t ret = B_FALSE;
-
-       if (spa_open(name, &spa, FTAG) == 0) {
-               if (spa->spa_bootfs) {
-                       objset_t *os;
+       objset_t *os;
 
-                       if (dmu_objset_open(name, DMU_OST_ZFS,
-                           DS_MODE_USER | DS_MODE_READONLY, &os) == 0) {
-                               ret = (dmu_objset_id(os) == spa->spa_bootfs);
-                               dmu_objset_close(os);
-                       }
-               }
-               spa_close(spa, FTAG);
+       if (dmu_objset_hold(name, FTAG, &os) == 0) {
+               boolean_t ret;
+               ret = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os)));
+               dmu_objset_rele(os, FTAG);
+               return (ret);
        }
-       return (ret);
+       return (B_FALSE);
 }
 
 /*
@@ -224,13 +222,17 @@ zpl_earlier_version(const char *name, int version)
        objset_t *os;
        boolean_t rc = B_TRUE;
 
-       if (dmu_objset_open(name, DMU_OST_ANY,
-           DS_MODE_USER | DS_MODE_READONLY, &os) == 0) {
+       if (dmu_objset_hold(name, FTAG, &os) == 0) {
                uint64_t zplversion;
 
+               if (dmu_objset_type(os) != DMU_OST_ZFS) {
+                       dmu_objset_rele(os, FTAG);
+                       return (B_TRUE);
+               }
+               /* XXX reading from non-owned objset */
                if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0)
                        rc = zplversion < version;
-               dmu_objset_close(os);
+               dmu_objset_rele(os, FTAG);
        }
        return (rc);
 }
@@ -330,9 +332,109 @@ zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
        return (error);
 }
 
+/*
+ * Policy for setting the security label property.
+ *
+ * Returns 0 for success, non-zero for access and other errors.
+ */
 static int
-zfs_secpolicy_setprop(const char *name, zfs_prop_t prop, cred_t *cr)
+zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr)
 {
+       char            ds_hexsl[MAXNAMELEN];
+       bslabel_t       ds_sl, new_sl;
+       boolean_t       new_default = FALSE;
+       uint64_t        zoned;
+       int             needed_priv = -1;
+       int             error;
+
+       /* First get the existing dataset label. */
+       error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
+           1, sizeof (ds_hexsl), &ds_hexsl, NULL);
+       if (error)
+               return (EPERM);
+
+       if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0)
+               new_default = TRUE;
+
+       /* The label must be translatable */
+       if (!new_default && (hexstr_to_label(strval, &new_sl) != 0))
+               return (EINVAL);
+
+       /*
+        * In a non-global zone, disallow attempts to set a label that
+        * doesn't match that of the zone; otherwise no other checks
+        * are needed.
+        */
+       if (!INGLOBALZONE(curproc)) {
+               if (new_default || !blequal(&new_sl, CR_SL(CRED())))
+                       return (EPERM);
+               return (0);
+       }
+
+       /*
+        * For global-zone datasets (i.e., those whose zoned property is
+        * "off", verify that the specified new label is valid for the
+        * global zone.
+        */
+       if (dsl_prop_get_integer(name,
+           zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
+               return (EPERM);
+       if (!zoned) {
+               if (zfs_check_global_label(name, strval) != 0)
+                       return (EPERM);
+       }
+
+       /*
+        * If the existing dataset label is nondefault, check if the
+        * dataset is mounted (label cannot be changed while mounted).
+        * Get the zfsvfs; if there isn't one, then the dataset isn't
+        * mounted (or isn't a dataset, doesn't exist, ...).
+        */
+       if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) {
+               objset_t *os;
+               static char *setsl_tag = "setsl_tag";
+
+               /*
+                * Try to own the dataset; abort if there is any error,
+                * (e.g., already mounted, in use, or other error).
+                */
+               error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE,
+                   setsl_tag, &os);
+               if (error)
+                       return (EPERM);
+
+               dmu_objset_disown(os, setsl_tag);
+
+               if (new_default) {
+                       needed_priv = PRIV_FILE_DOWNGRADE_SL;
+                       goto out_check;
+               }
+
+               if (hexstr_to_label(strval, &new_sl) != 0)
+                       return (EPERM);
+
+               if (blstrictdom(&ds_sl, &new_sl))
+                       needed_priv = PRIV_FILE_DOWNGRADE_SL;
+               else if (blstrictdom(&new_sl, &ds_sl))
+                       needed_priv = PRIV_FILE_UPGRADE_SL;
+       } else {
+               /* dataset currently has a default label */
+               if (!new_default)
+                       needed_priv = PRIV_FILE_UPGRADE_SL;
+       }
+
+out_check:
+       if (needed_priv != -1)
+               return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL));
+       return (0);
+}
+
+static int
+zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval,
+    cred_t *cr)
+{
+       char *strval;
+
        /*
         * Check permissions for special properties.
         */
@@ -354,16 +456,29 @@ zfs_secpolicy_setprop(const char *name, zfs_prop_t prop, cred_t *cr)
                         * quota on things *under* (ie. contained by)
                         * the thing they own.
                         */
-                       if (dsl_prop_get_integer(name, "zoned", &zoned,
+                       if (dsl_prop_get_integer(dsname, "zoned", &zoned,
                            setpoint))
                                return (EPERM);
-                       if (!zoned || strlen(name) <= strlen(setpoint))
+                       if (!zoned || strlen(dsname) <= strlen(setpoint))
                                return (EPERM);
                }
                break;
+
+       case ZFS_PROP_MLSLABEL:
+               if (!is_system_labeled())
+                       return (EPERM);
+
+               if (nvpair_value_string(propval, &strval) == 0) {
+                       int err;
+
+                       err = zfs_set_slabel_policy(dsname, strval, CRED());
+                       if (err != 0)
+                               return (err);
+               }
+               break;
        }
 
-       return (zfs_secpolicy_write_perms(name, zfs_prop_to_name(prop), cr));
+       return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr));
 }
 
 int
@@ -385,13 +500,8 @@ zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_t *cr)
 int
 zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr)
 {
-       int error;
-       error = zfs_secpolicy_write_perms(zc->zc_name,
-           ZFS_DELEG_PERM_ROLLBACK, cr);
-       if (error == 0)
-               error = zfs_secpolicy_write_perms(zc->zc_name,
-                   ZFS_DELEG_PERM_MOUNT, cr);
-       return (error);
+       return (zfs_secpolicy_write_perms(zc->zc_name,
+           ZFS_DELEG_PERM_ROLLBACK, cr));
 }
 
 int
@@ -492,19 +602,34 @@ zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr)
 }
 
 /*
- * Must have sys_config privilege to check the iscsi permission
+ * Destroying snapshots with delegated permissions requires
+ * descendent mount and destroy permissions.
+ * Reassemble the full filesystem@snap name so dsl_deleg_access()
+ * can do the correct permission check.
+ *
+ * Since this routine is used when doing a recursive destroy of snapshots
+ * and destroying snapshots requires descendent permissions, a successful
+ * check of the top level snapshot applies to snapshots of all descendent
+ * datasets as well.
  */
-/* ARGSUSED */
 static int
-zfs_secpolicy_iscsi(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, cred_t *cr)
 {
-       return (secpolicy_zfs(cr));
+       int error;
+       char *dsname;
+
+       dsname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value);
+
+       error = zfs_secpolicy_destroy_perms(dsname, cr);
+
+       strfree(dsname);
+       return (error);
 }
 
 int
 zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
 {
-       char    parentname[MAXNAMELEN];
+       char    parentname[MAXNAMELEN];
        int     error;
 
        if ((error = zfs_secpolicy_write_perms(from,
@@ -539,7 +664,7 @@ zfs_secpolicy_rename(zfs_cmd_t *zc, cred_t *cr)
 static int
 zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr)
 {
-       char    parentname[MAXNAMELEN];
+       char    parentname[MAXNAMELEN];
        objset_t *clone;
        int error;
 
@@ -548,20 +673,19 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr)
        if (error)
                return (error);
 
-       error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
-           DS_MODE_USER | DS_MODE_READONLY, &clone);
+       error = dmu_objset_hold(zc->zc_name, FTAG, &clone);
 
        if (error == 0) {
                dsl_dataset_t *pclone = NULL;
                dsl_dir_t *dd;
-               dd = clone->os->os_dsl_dataset->ds_dir;
+               dd = clone->os_dsl_dataset->ds_dir;
 
                rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
                error = dsl_dataset_hold_obj(dd->dd_pool,
                    dd->dd_phys->dd_origin_obj, FTAG, &pclone);
                rw_exit(&dd->dd_pool->dp_config_rwlock);
                if (error) {
-                       dmu_objset_close(clone);
+                       dmu_objset_rele(clone, FTAG);
                        return (error);
                }
 
@@ -569,7 +693,7 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr)
                    ZFS_DELEG_PERM_MOUNT, cr);
 
                dsl_dataset_name(pclone, parentname);
-               dmu_objset_close(clone);
+               dmu_objset_rele(clone, FTAG);
                dsl_dataset_rele(pclone, FTAG);
                if (error == 0)
                        error = zfs_secpolicy_write_perms(parentname,
@@ -598,16 +722,8 @@ zfs_secpolicy_receive(zfs_cmd_t *zc, cred_t *cr)
 int
 zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
 {
-       int error;
-
-       if ((error = zfs_secpolicy_write_perms(name,
-           ZFS_DELEG_PERM_SNAPSHOT, cr)) != 0)
-               return (error);
-
-       error = zfs_secpolicy_write_perms(name,
-           ZFS_DELEG_PERM_MOUNT, cr);
-
-       return (error);
+       return (zfs_secpolicy_write_perms(name,
+           ZFS_DELEG_PERM_SNAPSHOT, cr));
 }
 
 static int
@@ -620,8 +736,8 @@ zfs_secpolicy_snapshot(zfs_cmd_t *zc, cred_t *cr)
 static int
 zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr)
 {
-       char    parentname[MAXNAMELEN];
-       int     error;
+       char    parentname[MAXNAMELEN];
+       int     error;
 
        if ((error = zfs_get_parent(zc->zc_name, parentname,
            sizeof (parentname))) != 0)
@@ -670,22 +786,6 @@ zfs_secpolicy_config(zfs_cmd_t *zc, cred_t *cr)
 }
 
 /*
- * Just like zfs_secpolicy_config, except that we will check for
- * mount permission on the dataset for permission to create/remove
- * the minor nodes.
- */
-static int
-zfs_secpolicy_minor(zfs_cmd_t *zc, cred_t *cr)
-{
-       if (secpolicy_sys_config(cr, B_FALSE) != 0) {
-               return (dsl_deleg_access(zc->zc_name,
-                   ZFS_DELEG_PERM_MOUNT, cr));
-       }
-
-       return (0);
-}
-
-/*
  * Policy for fault injection.  Requires all privileges.
  */
 /* ARGSUSED */
@@ -706,9 +806,8 @@ zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr)
                return (zfs_secpolicy_write_perms(zc->zc_name,
                    ZFS_DELEG_PERM_USERPROP, cr));
        } else {
-               if (!zfs_prop_inheritable(prop))
-                       return (EINVAL);
-               return (zfs_secpolicy_setprop(zc->zc_name, prop, cr));
+               return (zfs_secpolicy_setprop(zc->zc_name, prop,
+                   NULL, cr));
        }
 }
 
@@ -758,7 +857,8 @@ zfs_secpolicy_userspace_many(zfs_cmd_t *zc, cred_t *cr)
 static int
 zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, cred_t *cr)
 {
-       return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, cr));
+       return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION,
+           NULL, cr));
 }
 
 static int
@@ -811,11 +911,46 @@ get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp)
 }
 
 static int
+fit_error_list(zfs_cmd_t *zc, nvlist_t **errors)
+{
+       size_t size;
+
+       VERIFY(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0);
+
+       if (size > zc->zc_nvlist_dst_size) {
+               nvpair_t *more_errors;
+               int n = 0;
+
+               if (zc->zc_nvlist_dst_size < 1024)
+                       return (ENOMEM);
+
+               VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, 0) == 0);
+               more_errors = nvlist_prev_nvpair(*errors, NULL);
+
+               do {
+                       nvpair_t *pair = nvlist_prev_nvpair(*errors,
+                           more_errors);
+                       VERIFY(nvlist_remove_nvpair(*errors, pair) == 0);
+                       n++;
+                       VERIFY(nvlist_size(*errors, &size,
+                           NV_ENCODE_NATIVE) == 0);
+               } while (size > zc->zc_nvlist_dst_size);
+
+               VERIFY(nvlist_remove_nvpair(*errors, more_errors) == 0);
+               VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, n) == 0);
+               ASSERT(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0);
+               ASSERT(size <= zc->zc_nvlist_dst_size);
+       }
+
+       return (0);
+}
+
+static int
 put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
 {
        char *packed = NULL;
+       int error = 0;
        size_t size;
-       int error;
 
        VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0);
 
@@ -825,8 +960,9 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
                packed = kmem_alloc(size, KM_SLEEP);
                VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE,
                    KM_SLEEP) == 0);
-               error = ddi_copyout(packed,
-                   (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags);
+               if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst,
+                   size, zc->zc_iflags) != 0)
+                       error = EFAULT;
                kmem_free(packed, size);
        }
 
@@ -835,25 +971,28 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
 }
 
 static int
-getzfsvfs(const char *dsname, zfsvfs_t **zvp)
+getzfsvfs(const char *dsname, zfsvfs_t **zfvp)
 {
        objset_t *os;
        int error;
 
-       error = dmu_objset_open(dsname, DMU_OST_ZFS,
-           DS_MODE_USER | DS_MODE_READONLY, &os);
+       error = dmu_objset_hold(dsname, FTAG, &os);
        if (error)
                return (error);
+       if (dmu_objset_type(os) != DMU_OST_ZFS) {
+               dmu_objset_rele(os, FTAG);
+               return (EINVAL);
+       }
 
-       mutex_enter(&os->os->os_user_ptr_lock);
-       *zvp = dmu_objset_get_user(os);
-       if (*zvp) {
-               VFS_HOLD((*zvp)->z_vfs);
+       mutex_enter(&os->os_user_ptr_lock);
+       *zfvp = dmu_objset_get_user(os);
+       if (*zfvp) {
+               VFS_HOLD((*zfvp)->z_vfs);
        } else {
                error = ESRCH;
        }
-       mutex_exit(&os->os->os_user_ptr_lock);
-       dmu_objset_close(os);
+       mutex_exit(&os->os_user_ptr_lock);
+       dmu_objset_rele(os, FTAG);
        return (error);
 }
 
@@ -862,22 +1001,21 @@ getzfsvfs(const char *dsname, zfsvfs_t **zvp)
  * case its z_vfs will be NULL, and it will be opened as the owner.
  */
 static int
-zfsvfs_hold(const char *name, boolean_t readonly, void *tag, zfsvfs_t **zvp)
+zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp)
 {
        int error = 0;
-       int mode = DS_MODE_OWNER | (readonly ? DS_MODE_READONLY : 0);
 
-       if (getzfsvfs(name, zvp) != 0)
-               error = zfsvfs_create(name, mode, zvp);
+       if (getzfsvfs(name, zfvp) != 0)
+               error = zfsvfs_create(name, zfvp);
        if (error == 0) {
-               rrw_enter(&(*zvp)->z_teardown_lock, RW_READER, tag);
-               if ((*zvp)->z_unmounted) {
+               rrw_enter(&(*zfvp)->z_teardown_lock, RW_READER, tag);
+               if ((*zfvp)->z_unmounted) {
                        /*
                         * XXX we could probably try again, since the unmounting
                         * thread should be just about to disassociate the
                         * objset from the zfsvfs.
                         */
-                       rrw_exit(&(*zvp)->z_teardown_lock, tag);
+                       rrw_exit(&(*zfvp)->z_teardown_lock, tag);
                        return (EBUSY);
                }
        }
@@ -892,7 +1030,7 @@ zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
        if (zfsvfs->z_vfs) {
                VFS_RELE(zfsvfs->z_vfs);
        } else {
-               dmu_objset_close(zfsvfs->z_os);
+               dmu_objset_disown(zfsvfs->z_os, zfsvfs);
                zfsvfs_free(zfsvfs);
        }
 }
@@ -951,8 +1089,8 @@ zfs_ioc_pool_create(zfs_cmd_t *zc)
        /*
         * Set the remaining root properties
         */
-       if (!error &&
-           (error = zfs_set_prop_nvlist(zc->zc_name, rootprops)) != 0)
+       if (!error && (error = zfs_set_prop_nvlist(zc->zc_name,
+           ZPROP_SRC_LOCAL, rootprops, NULL)) != 0)
                (void) spa_destroy(zc->zc_name);
 
        if (buf != NULL)
@@ -973,15 +1111,17 @@ zfs_ioc_pool_destroy(zfs_cmd_t *zc)
        int error;
        zfs_log_history(zc);
        error = spa_destroy(zc->zc_name);
+       if (error == 0)
+               zvol_remove_minors(zc->zc_name);
        return (error);
 }
 
 static int
 zfs_ioc_pool_import(zfs_cmd_t *zc)
 {
-       int error;
        nvlist_t *config, *props = NULL;
        uint64_t guid;
+       int error;
 
        if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
            zc->zc_iflags, &config)) != 0)
@@ -998,11 +1138,13 @@ zfs_ioc_pool_import(zfs_cmd_t *zc)
            guid != zc->zc_guid)
                error = EINVAL;
        else if (zc->zc_cookie)
-               error = spa_import_verbatim(zc->zc_name, config,
-                   props);
+               error = spa_import_verbatim(zc->zc_name, config, props);
        else
                error = spa_import(zc->zc_name, config, props);
 
+       if (zc->zc_nvlist_dst != 0)
+               (void) put_nvlist(zc, config);
+
        nvlist_free(config);
 
        if (props)
@@ -1020,6 +1162,8 @@ zfs_ioc_pool_export(zfs_cmd_t *zc)
 
        zfs_log_history(zc);
        error = spa_export(zc->zc_name, NULL, force, hardforce);
+       if (error == 0)
+               zvol_remove_minors(zc->zc_name);
        return (error);
 }
 
@@ -1093,8 +1237,13 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
        return (error);
 }
 
+/*
+ * inputs:
+ * zc_name              name of the pool
+ * zc_cookie            scan func (pool_scan_func_t)
+ */
 static int
-zfs_ioc_pool_scrub(zfs_cmd_t *zc)
+zfs_ioc_pool_scan(zfs_cmd_t *zc)
 {
        spa_t *spa;
        int error;
@@ -1102,7 +1251,10 @@ zfs_ioc_pool_scrub(zfs_cmd_t *zc)
        if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
                return (error);
 
-       error = spa_scrub(spa, zc->zc_cookie);
+       if (zc->zc_cookie == POOL_SCAN_NONE)
+               error = spa_scan_stop(spa);
+       else
+               error = spa_scan(spa, zc->zc_cookie);
 
        spa_close(spa, FTAG);
 
@@ -1186,18 +1338,30 @@ zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
        return (0);
 }
 
+/*
+ * inputs:
+ * zc_name             name of filesystem
+ * zc_obj              object to find
+ *
+ * outputs:
+ * zc_value            name of object
+ */
 static int
 zfs_ioc_obj_to_path(zfs_cmd_t *zc)
 {
-       objset_t *osp;
+       objset_t *os;
        int error;
 
-       if ((error = dmu_objset_open(zc->zc_name, DMU_OST_ZFS,
-           DS_MODE_USER | DS_MODE_READONLY, &osp)) != 0)
+       /* XXX reading from objset not owned */
+       if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0)
                return (error);
-       error = zfs_obj_to_path(osp, zc->zc_obj, zc->zc_value,
+       if (dmu_objset_type(os) != DMU_OST_ZFS) {
+               dmu_objset_rele(os, FTAG);
+               return (EINVAL);
+       }
+       error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value,
            sizeof (zc->zc_value));
-       dmu_objset_close(osp);
+       dmu_objset_rele(os, FTAG);
 
        return (error);
 }
@@ -1232,7 +1396,8 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc)
         *
         * l2cache and spare devices are ok to be added to a rootpool.
         */
-       if (spa->spa_bootfs != 0 && nl2cache == 0 && nspares == 0) {
+       if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) {
+               nvlist_free(config);
                spa_close(spa, FTAG);
                return (EDOM);
        }
@@ -1245,6 +1410,12 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc)
        return (error);
 }
 
+/*
+ * inputs:
+ * zc_name             name of the pool
+ * zc_nvlist_conf      nvlist of devices to remove
+ * zc_cookie           to stop the remove?
+ */
 static int
 zfs_ioc_vdev_remove(zfs_cmd_t *zc)
 {
@@ -1278,11 +1449,19 @@ zfs_ioc_vdev_set_state(zfs_cmd_t *zc)
                break;
 
        case VDEV_STATE_FAULTED:
-               error = vdev_fault(spa, zc->zc_guid);
+               if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
+                   zc->zc_obj != VDEV_AUX_EXTERNAL)
+                       zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;
+
+               error = vdev_fault(spa, zc->zc_guid, zc->zc_obj);
                break;
 
        case VDEV_STATE_DEGRADED:
-               error = vdev_degrade(spa, zc->zc_guid);
+               if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
+                   zc->zc_obj != VDEV_AUX_EXTERNAL)
+                       zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;
+
+               error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj);
                break;
 
        default:
@@ -1330,6 +1509,41 @@ zfs_ioc_vdev_detach(zfs_cmd_t *zc)
 }
 
 static int
+zfs_ioc_vdev_split(zfs_cmd_t *zc)
+{
+       spa_t *spa;
+       nvlist_t *config, *props = NULL;
+       int error;
+       boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT);
+
+       if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+               return (error);
+
+       if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+           zc->zc_iflags, &config)) {
+               spa_close(spa, FTAG);
+               return (error);
+       }
+
+       if (zc->zc_nvlist_src_size != 0 && (error =
+           get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+           zc->zc_iflags, &props))) {
+               spa_close(spa, FTAG);
+               nvlist_free(config);
+               return (error);
+       }
+
+       error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp);
+
+       spa_close(spa, FTAG);
+
+       nvlist_free(config);
+       nvlist_free(props);
+
+       return (error);
+}
+
+static int
 zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
 {
        spa_t *spa;
@@ -1380,20 +1594,20 @@ zfs_ioc_objset_stats(zfs_cmd_t *zc)
        int error;
        nvlist_t *nv;
 
-       if (error = dmu_objset_open(zc->zc_name,
-           DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os))
+       if (error = dmu_objset_hold(zc->zc_name, FTAG, &os))
                return (error);
 
        dmu_objset_fast_stat(os, &zc->zc_objset_stats);
 
        if (zc->zc_nvlist_dst != 0 &&
-           (error = dsl_prop_get_all(os, &nv, FALSE)) == 0) {
+           (error = dsl_prop_get_all(os, &nv)) == 0) {
                dmu_objset_stats(os, nv);
                /*
                 * NB: zvol_get_stats() will read the objset contents,
                 * which we aren't supposed to do with a
                 * DS_MODE_USER hold, because it could be
                 * inconsistent.  So this is a bit of a workaround...
+                * XXX reading without owning
                 */
                if (!zc->zc_objset_stats.dds_inconsistent) {
                        if (dmu_objset_type(os) == DMU_OST_ZVOL)
@@ -1403,7 +1617,50 @@ zfs_ioc_objset_stats(zfs_cmd_t *zc)
                nvlist_free(nv);
        }
 
-       dmu_objset_close(os);
+       dmu_objset_rele(os, FTAG);
+       return (error);
+}
+
+/*
+ * inputs:
+ * zc_name             name of filesystem
+ * zc_nvlist_dst_size  size of buffer for property nvlist
+ *
+ * outputs:
+ * zc_nvlist_dst       received property nvlist
+ * zc_nvlist_dst_size  size of received property nvlist
+ *
+ * Gets received properties (distinct from local properties on or after
+ * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from
+ * local property values.
+ */
+static int
+zfs_ioc_objset_recvd_props(zfs_cmd_t *zc)
+{
+       objset_t *os = NULL;
+       int error;
+       nvlist_t *nv;
+
+       if (error = dmu_objset_hold(zc->zc_name, FTAG, &os))
+               return (error);
+
+       /*
+        * Without this check, we would return local property values if the
+        * caller has not already received properties on or after
+        * SPA_VERSION_RECVD_PROPS.
+        */
+       if (!dsl_prop_get_hasrecvd(os)) {
+               dmu_objset_rele(os, FTAG);
+               return (ENOTSUP);
+       }
+
+       if (zc->zc_nvlist_dst != 0 &&
+           (error = dsl_prop_get_received(os, &nv)) == 0) {
+               error = put_nvlist(zc, nv);
+               nvlist_free(nv);
+       }
+
+       dmu_objset_rele(os, FTAG);
        return (error);
 }
 
@@ -1438,8 +1695,8 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
        objset_t *os;
        int err;
 
-       if (err = dmu_objset_open(zc->zc_name,
-           DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os))
+       /* XXX reading without owning */
+       if (err = dmu_objset_hold(zc->zc_name, FTAG, &os))
                return (err);
 
        dmu_objset_fast_stat(os, &zc->zc_objset_stats);
@@ -1464,7 +1721,7 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
        } else {
                err = ENOENT;
        }
-       dmu_objset_close(os);
+       dmu_objset_rele(os, FTAG);
        return (err);
 }
 
@@ -1504,9 +1761,10 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
        objset_t *os;
        int error;
        char *p;
+       size_t orig_len = strlen(zc->zc_name);
 
-       if (error = dmu_objset_open(zc->zc_name,
-           DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os)) {
+top:
+       if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) {
                if (error == ENOENT)
                        error = ESRCH;
                return (error);
@@ -1535,12 +1793,22 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
                    NULL, &zc->zc_cookie);
                if (error == ENOENT)
                        error = ESRCH;
-       } while (error == 0 && dataset_name_hidden(zc->zc_name));
-       dmu_objset_close(os);
+       } while (error == 0 && dataset_name_hidden(zc->zc_name) &&
+           !(zc->zc_iflags & FKIOCTL));
+       dmu_objset_rele(os, FTAG);
 
-       if (error == 0)
+       /*
+        * If it's an internal dataset (ie. with a '$' in its name),
+        * don't try to get stats for it, otherwise we'll return ENOENT.
+        */
+       if (error == 0 && strchr(zc->zc_name, '$') == NULL) {
                error = zfs_ioc_objset_stats(zc); /* fill in the stats */
-
+               if (error == ENOENT) {
+                       /* We lost a race with destroy, get the next one. */
+                       zc->zc_name[orig_len] = '\0';
+                       goto top;
+               }
+       }
        return (error);
 }
 
@@ -1562,32 +1830,38 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
        objset_t *os;
        int error;
 
-       error = dmu_objset_open(zc->zc_name,
-           DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os);
+top:
+       if (zc->zc_cookie == 0)
+               (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch,
+                   NULL, DS_FIND_SNAPSHOTS);
+
+       error = dmu_objset_hold(zc->zc_name, FTAG, &os);
        if (error)
                return (error == ENOENT ? ESRCH : error);
 
-       if (zc->zc_cookie == 0) {
-               (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch,
-                   NULL, DS_FIND_SNAPSHOTS);
-       }
        /*
         * A dataset name of maximum length cannot have any snapshots,
         * so exit immediately.
         */
        if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) {
-               dmu_objset_close(os);
+               dmu_objset_rele(os, FTAG);
                return (ESRCH);
        }
 
        error = dmu_snapshot_list_next(os,
            sizeof (zc->zc_name) - strlen(zc->zc_name),
            zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie, NULL);
-       dmu_objset_close(os);
-       if (error == 0)
+       dmu_objset_rele(os, FTAG);
+       if (error == 0) {
                error = zfs_ioc_objset_stats(zc); /* fill in the stats */
-       else if (error == ENOENT)
+               if (error == ENOENT)  {
+                       /* We lost a race with destroy, get the next one. */
+                       *strchr(zc->zc_name, '@') = '\0';
+                       goto top;
+               }
+       } else if (error == ENOENT) {
                error = ESRCH;
+       }
 
        /* if we failed, undo the @ that we tacked on to zc_name */
        if (error)
@@ -1595,266 +1869,300 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
        return (error);
 }
 
-int
-zfs_set_prop_nvlist(const char *name, nvlist_t *nvl)
+static int
+zfs_prop_set_userquota(const char *dsname, nvpair_t *pair)
 {
-       nvpair_t *elem;
-       int error = 0;
-       uint64_t intval;
-       char *strval;
-       nvlist_t *genericnvl;
-       boolean_t issnap = (strchr(name, '@') != NULL);
+       const char *propname = nvpair_name(pair);
+       uint64_t *valary;
+       unsigned int vallen;
+       const char *domain;
+       char *dash;
+       zfs_userquota_prop_t type;
+       uint64_t rid;
+       uint64_t quota;
+       zfsvfs_t *zfsvfs;
+       int err;
+
+       if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+               nvlist_t *attrs;
+               VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+               if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+                   &pair) != 0)
+                       return (EINVAL);
+       }
 
        /*
-        * First validate permission to set all of the properties
+        * A correctly constructed propname is encoded as
+        * userquota@<rid>-<domain>.
         */
-       elem = NULL;
-       while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
-               const char *propname = nvpair_name(elem);
-               zfs_prop_t prop = zfs_name_to_prop(propname);
+       if ((dash = strchr(propname, '-')) == NULL ||
+           nvpair_value_uint64_array(pair, &valary, &vallen) != 0 ||
+           vallen != 3)
+               return (EINVAL);
 
-               if (prop == ZPROP_INVAL) {
-                       /*
-                        * If this is a user-defined property, it must be a
-                        * string, and there is no further validation to do.
-                        */
-                       if (zfs_prop_user(propname) &&
-                           nvpair_type(elem) == DATA_TYPE_STRING) {
-                               if (error = zfs_secpolicy_write_perms(name,
-                                   ZFS_DELEG_PERM_USERPROP, CRED()))
-                                       return (error);
-                               continue;
-                       }
+       domain = dash + 1;
+       type = valary[0];
+       rid = valary[1];
+       quota = valary[2];
 
-                       if (!issnap && zfs_prop_userquota(propname) &&
-                           nvpair_type(elem) == DATA_TYPE_UINT64_ARRAY) {
-                               const char *perm;
-                               const char *up = zfs_userquota_prop_prefixes
-                                   [ZFS_PROP_USERQUOTA];
-                               if (strncmp(propname, up, strlen(up)) == 0)
-                                       perm = ZFS_DELEG_PERM_USERQUOTA;
-                               else
-                                       perm = ZFS_DELEG_PERM_GROUPQUOTA;
-                               if (error = zfs_secpolicy_write_perms(name,
-                                   perm, CRED()))
-                                       return (error);
-                               continue;
-                       }
+       err = zfsvfs_hold(dsname, FTAG, &zfsvfs);
+       if (err == 0) {
+               err = zfs_set_userquota(zfsvfs, type, domain, rid, quota);
+               zfsvfs_rele(zfsvfs, FTAG);
+       }
 
-                       return (EINVAL);
-               }
+       return (err);
+}
 
-               if (issnap)
-                       return (EINVAL);
+/*
+ * If the named property is one that has a special function to set its value,
+ * return 0 on success and a positive error code on failure; otherwise if it is
+ * not one of the special properties handled by this function, return -1.
+ *
+ * XXX: It would be better for callers of the property interface if we handled
+ * these special cases in dsl_prop.c (in the dsl layer).
+ */
+static int
+zfs_prop_set_special(const char *dsname, zprop_source_t source,
+    nvpair_t *pair)
+{
+       const char *propname = nvpair_name(pair);
+       zfs_prop_t prop = zfs_name_to_prop(propname);
+       uint64_t intval;
+       int err;
 
-               if ((error = zfs_secpolicy_setprop(name, prop, CRED())) != 0)
-                       return (error);
+       if (prop == ZPROP_INVAL) {
+               if (zfs_prop_userquota(propname))
+                       return (zfs_prop_set_userquota(dsname, pair));
+               return (-1);
+       }
 
-               /*
-                * Check that this value is valid for this pool version
-                */
-               switch (prop) {
-               case ZFS_PROP_COMPRESSION:
-                       /*
-                        * If the user specified gzip compression, make sure
-                        * the SPA supports it. We ignore any errors here since
-                        * we'll catch them later.
-                        */
-                       if (nvpair_type(elem) == DATA_TYPE_UINT64 &&
-                           nvpair_value_uint64(elem, &intval) == 0) {
-                               if (intval >= ZIO_COMPRESS_GZIP_1 &&
-                                   intval <= ZIO_COMPRESS_GZIP_9 &&
-                                   zfs_earlier_version(name,
-                                   SPA_VERSION_GZIP_COMPRESSION))
-                                       return (ENOTSUP);
+       if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+               nvlist_t *attrs;
+               VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+               VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+                   &pair) == 0);
+       }
 
-                               /*
-                                * If this is a bootable dataset then
-                                * verify that the compression algorithm
-                                * is supported for booting. We must return
-                                * something other than ENOTSUP since it
-                                * implies a downrev pool version.
-                                */
-                               if (zfs_is_bootfs(name) &&
-                                   !BOOTFS_COMPRESS_VALID(intval))
-                                       return (ERANGE);
-                       }
-                       break;
+       if (zfs_prop_get_type(prop) == PROP_TYPE_STRING)
+               return (-1);
 
-               case ZFS_PROP_COPIES:
-                       if (zfs_earlier_version(name, SPA_VERSION_DITTO_BLOCKS))
-                               return (ENOTSUP);
-                       break;
+       VERIFY(0 == nvpair_value_uint64(pair, &intval));
 
-               case ZFS_PROP_SHARESMB:
-                       if (zpl_earlier_version(name, ZPL_VERSION_FUID))
-                               return (ENOTSUP);
+       switch (prop) {
+       case ZFS_PROP_QUOTA:
+               err = dsl_dir_set_quota(dsname, source, intval);
+               break;
+       case ZFS_PROP_REFQUOTA:
+               err = dsl_dataset_set_quota(dsname, source, intval);
+               break;
+       case ZFS_PROP_RESERVATION:
+               err = dsl_dir_set_reservation(dsname, source, intval);
+               break;
+       case ZFS_PROP_REFRESERVATION:
+               err = dsl_dataset_set_reservation(dsname, source, intval);
+               break;
+       case ZFS_PROP_VOLSIZE:
+               err = zvol_set_volsize(dsname, ddi_driver_major(zfs_dip),
+                   intval);
+               break;
+       case ZFS_PROP_VERSION:
+       {
+               zfsvfs_t *zfsvfs;
+
+               if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs)) != 0)
                        break;
 
-               case ZFS_PROP_ACLINHERIT:
-                       if (nvpair_type(elem) == DATA_TYPE_UINT64 &&
-                           nvpair_value_uint64(elem, &intval) == 0)
-                               if (intval == ZFS_ACL_PASSTHROUGH_X &&
-                                   zfs_earlier_version(name,
-                                   SPA_VERSION_PASSTHROUGH_X))
-                                       return (ENOTSUP);
-               }
-       }
+               err = zfs_set_version(zfsvfs, intval);
+               zfsvfs_rele(zfsvfs, FTAG);
 
-       VERIFY(nvlist_alloc(&genericnvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-       elem = NULL;
-       while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
-               const char *propname = nvpair_name(elem);
-               zfs_prop_t prop = zfs_name_to_prop(propname);
+               if (err == 0 && intval >= ZPL_VERSION_USERSPACE) {
+                       zfs_cmd_t *zc;
 
-               if (prop == ZPROP_INVAL) {
-                       if (zfs_prop_userquota(propname)) {
-                               uint64_t *valary;
-                               unsigned int vallen;
-                               const char *domain;
-                               zfs_userquota_prop_t type;
-                               uint64_t rid;
-                               uint64_t quota;
-                               zfsvfs_t *zfsvfs;
-
-                               VERIFY(nvpair_value_uint64_array(elem,
-                                   &valary, &vallen) == 0);
-                               VERIFY(vallen == 3);
-                               type = valary[0];
-                               rid = valary[1];
-                               quota = valary[2];
-                               domain = propname +
-                                   strlen(zfs_userquota_prop_prefixes[type]);
-
-                               error = zfsvfs_hold(name, B_FALSE, FTAG,
-                                   &zfsvfs);
-                               if (error == 0) {
-                                       error = zfs_set_userquota(zfsvfs,
-                                           type, domain, rid, quota);
-                                       zfsvfs_rele(zfsvfs, FTAG);
-                               }
-                               if (error == 0)
-                                       continue;
-                               else
-                                       goto out;
-                       } else if (zfs_prop_user(propname)) {
-                               VERIFY(nvpair_value_string(elem, &strval) == 0);
-                               error = dsl_prop_set(name, propname, 1,
-                                   strlen(strval) + 1, strval);
-                               if (error == 0)
-                                       continue;
-                               else
-                                       goto out;
-                       }
+                       zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+                       (void) strcpy(zc->zc_name, dsname);
+                       (void) zfs_ioc_userspace_upgrade(zc);
+                       kmem_free(zc, sizeof (zfs_cmd_t));
                }
+               break;
+       }
 
-               switch (prop) {
-               case ZFS_PROP_QUOTA:
-                       if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
-                           (error = dsl_dir_set_quota(name, intval)) != 0)
-                               goto out;
-                       break;
-
-               case ZFS_PROP_REFQUOTA:
-                       if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
-                           (error = dsl_dataset_set_quota(name, intval)) != 0)
-                               goto out;
-                       break;
-
-               case ZFS_PROP_RESERVATION:
-                       if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
-                           (error = dsl_dir_set_reservation(name,
-                           intval)) != 0)
-                               goto out;
-                       break;
+       default:
+               err = -1;
+       }
 
-               case ZFS_PROP_REFRESERVATION:
-                       if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
-                           (error = dsl_dataset_set_reservation(name,
-                           intval)) != 0)
-                               goto out;
-                       break;
+       return (err);
+}
 
-               case ZFS_PROP_VOLSIZE:
-                       if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
-                           (error = zvol_set_volsize(name,
-                           ddi_driver_major(zfs_dip), intval)) != 0)
-                               goto out;
-                       break;
+/*
+ * This function is best effort. If it fails to set any of the given properties,
+ * it continues to set as many as it can and returns the first error
+ * encountered. If the caller provides a non-NULL errlist, it also gives the
+ * complete list of names of all the properties it failed to set along with the
+ * corresponding error numbers. The caller is responsible for freeing the
+ * returned errlist.
+ *
+ * If every property is set successfully, zero is returned and the list pointed
+ * at by errlist is NULL.
+ */
+int
+zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl,
+    nvlist_t **errlist)
+{
+       nvpair_t *pair;
+       nvpair_t *propval;
+       int rv = 0;
+       uint64_t intval;
+       char *strval;
+       nvlist_t *genericnvl;
+       nvlist_t *errors;
+       nvlist_t *retrynvl;
 
-               case ZFS_PROP_VOLBLOCKSIZE:
-                       if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
-                           (error = zvol_set_volblocksize(name, intval)) != 0)
-                               goto out;
-                       break;
+       VERIFY(nvlist_alloc(&genericnvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+       VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+       VERIFY(nvlist_alloc(&retrynvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
-               case ZFS_PROP_VERSION:
-               {
-                       zfsvfs_t *zfsvfs;
-
-                       if ((error = nvpair_value_uint64(elem, &intval)) != 0)
-                               goto out;
-                       if ((error = zfsvfs_hold(name, B_FALSE, FTAG,
-                           &zfsvfs)) != 0)
-                               goto out;
-                       error = zfs_set_version(zfsvfs, intval);
-                       zfsvfs_rele(zfsvfs, FTAG);
-
-                       if (error == 0 && intval >= ZPL_VERSION_USERSPACE) {
-                               zfs_cmd_t zc = { 0 };
-                               (void) strcpy(zc.zc_name, name);
-                               (void) zfs_ioc_userspace_upgrade(&zc);
-                       }
-                       if (error)
-                               goto out;
-                       break;
+retry:
+       pair = NULL;
+       while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
+               const char *propname = nvpair_name(pair);
+               zfs_prop_t prop = zfs_name_to_prop(propname);
+               int err = 0;
+
+               /* decode the property value */
+               propval = pair;
+               if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+                       nvlist_t *attrs;
+                       VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+                       if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+                           &propval) != 0)
+                               err = EINVAL;
                }
 
-               default:
-                       if (nvpair_type(elem) == DATA_TYPE_STRING) {
-                               if (zfs_prop_get_type(prop) !=
-                                   PROP_TYPE_STRING) {
-                                       error = EINVAL;
-                                       goto out;
-                               }
-                       } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
+               /* Validate value type */
+               if (err == 0 && prop == ZPROP_INVAL) {
+                       if (zfs_prop_user(propname)) {
+                               if (nvpair_type(propval) != DATA_TYPE_STRING)
+                                       err = EINVAL;
+                       } else if (zfs_prop_userquota(propname)) {
+                               if (nvpair_type(propval) !=
+                                   DATA_TYPE_UINT64_ARRAY)
+                                       err = EINVAL;
+                       }
+               } else if (err == 0) {
+                       if (nvpair_type(propval) == DATA_TYPE_STRING) {
+                               if (zfs_prop_get_type(prop) != PROP_TYPE_STRING)
+                                       err = EINVAL;
+                       } else if (nvpair_type(propval) == DATA_TYPE_UINT64) {
                                const char *unused;
 
-                               VERIFY(nvpair_value_uint64(elem, &intval) == 0);
+                               VERIFY(nvpair_value_uint64(propval,
+                                   &intval) == 0);
 
                                switch (zfs_prop_get_type(prop)) {
                                case PROP_TYPE_NUMBER:
                                        break;
                                case PROP_TYPE_STRING:
-                                       error = EINVAL;
-                                       goto out;
+                                       err = EINVAL;
+                                       break;
                                case PROP_TYPE_INDEX:
                                        if (zfs_prop_index_to_string(prop,
-                                           intval, &unused) != 0) {
-                                               error = EINVAL;
-                                               goto out;
-                                       }
+                                           intval, &unused) != 0)
+                                               err = EINVAL;
                                        break;
                                default:
                                        cmn_err(CE_PANIC,
                                            "unknown property type");
-                                       break;
                                }
                        } else {
-                               error = EINVAL;
-                               goto out;
+                               err = EINVAL;
                        }
-                       if ((error = nvlist_add_nvpair(genericnvl, elem)) != 0)
-                               goto out;
                }
+
+               /* Validate permissions */
+               if (err == 0)
+                       err = zfs_check_settable(dsname, pair, CRED());
+
+               if (err == 0) {
+                       err = zfs_prop_set_special(dsname, source, pair);
+                       if (err == -1) {
+                               /*
+                                * For better performance we build up a list of
+                                * properties to set in a single transaction.
+                                */
+                               err = nvlist_add_nvpair(genericnvl, pair);
+                       } else if (err != 0 && nvl != retrynvl) {
+                               /*
+                                * This may be a spurious error caused by
+                                * receiving quota and reservation out of order.
+                                * Try again in a second pass.
+                                */
+                               err = nvlist_add_nvpair(retrynvl, pair);
+                       }
+               }
+
+               if (err != 0)
+                       VERIFY(nvlist_add_int32(errors, propname, err) == 0);
        }
 
-       if (nvlist_next_nvpair(genericnvl, NULL) != NULL) {
-               error = dsl_props_set(name, genericnvl);
+       if (nvl != retrynvl && !nvlist_empty(retrynvl)) {
+               nvl = retrynvl;
+               goto retry;
+       }
+
+       if (!nvlist_empty(genericnvl) &&
+           dsl_props_set(dsname, source, genericnvl) != 0) {
+               /*
+                * If this fails, we still want to set as many properties as we
+                * can, so try setting them individually.
+                */
+               pair = NULL;
+               while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) {
+                       const char *propname = nvpair_name(pair);
+                       int err = 0;
+
+                       propval = pair;
+                       if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+                               nvlist_t *attrs;
+                               VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+                               VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+                                   &propval) == 0);
+                       }
+
+                       if (nvpair_type(propval) == DATA_TYPE_STRING) {
+                               VERIFY(nvpair_value_string(propval,
+                                   &strval) == 0);
+                               err = dsl_prop_set(dsname, propname, source, 1,
+                                   strlen(strval) + 1, strval);
+                       } else {
+                               VERIFY(nvpair_value_uint64(propval,
+                                   &intval) == 0);
+                               err = dsl_prop_set(dsname, propname, source, 8,
+                                   1, &intval);
+                       }
+
+                       if (err != 0) {
+                               VERIFY(nvlist_add_int32(errors, propname,
+                                   err) == 0);
+                       }
+               }
        }
-out:
        nvlist_free(genericnvl);
-       return (error);
+       nvlist_free(retrynvl);
+
+       if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) {
+               nvlist_free(errors);
+               errors = NULL;
+       } else {
+               VERIFY(nvpair_value_int32(pair, &rv) == 0);
+       }
+
+       if (errlist == NULL)
+               nvlist_free(errors);
+       else
+               *errlist = errors;
+
+       return (rv);
 }
 
 /*
@@ -1863,15 +2171,15 @@ out:
 static int
 zfs_check_userprops(char *fsname, nvlist_t *nvl)
 {
-       nvpair_t *elem = NULL;
+       nvpair_t *pair = NULL;
        int error = 0;
 
-       while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
-               const char *propname = nvpair_name(elem);
+       while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
+               const char *propname = nvpair_name(pair);
                char *valstr;
 
                if (!zfs_prop_user(propname) ||
-                   nvpair_type(elem) != DATA_TYPE_STRING)
+                   nvpair_type(pair) != DATA_TYPE_STRING)
                        return (EINVAL);
 
                if (error = zfs_secpolicy_write_perms(fsname,
@@ -1881,49 +2189,96 @@ zfs_check_userprops(char *fsname, nvlist_t *nvl)
                if (strlen(propname) >= ZAP_MAXNAMELEN)
                        return (ENAMETOOLONG);
 
-               VERIFY(nvpair_value_string(elem, &valstr) == 0);
+               VERIFY(nvpair_value_string(pair, &valstr) == 0);
                if (strlen(valstr) >= ZAP_MAXVALUELEN)
                        return (E2BIG);
        }
        return (0);
 }
 
+static void
+props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops)
+{
+       nvpair_t *pair;
+
+       VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+       pair = NULL;
+       while ((pair = nvlist_next_nvpair(props, pair)) != NULL) {
+               if (nvlist_exists(skipped, nvpair_name(pair)))
+                       continue;
+
+               VERIFY(nvlist_add_nvpair(*newprops, pair) == 0);
+       }
+}
+
+static int
+clear_received_props(objset_t *os, const char *fs, nvlist_t *props,
+    nvlist_t *skipped)
+{
+       int err = 0;
+       nvlist_t *cleared_props = NULL;
+       props_skip(props, skipped, &cleared_props);
+       if (!nvlist_empty(cleared_props)) {
+               /*
+                * Acts on local properties until the dataset has received
+                * properties at least once on or after SPA_VERSION_RECVD_PROPS.
+                */
+               zprop_source_t flags = (ZPROP_SRC_NONE |
+                   (dsl_prop_get_hasrecvd(os) ? ZPROP_SRC_RECEIVED : 0));
+               err = zfs_set_prop_nvlist(fs, flags, cleared_props, NULL);
+       }
+       nvlist_free(cleared_props);
+       return (err);
+}
+
 /*
  * inputs:
  * zc_name             name of filesystem
  * zc_value            name of property to set
  * zc_nvlist_src{_size}        nvlist of properties to apply
- * zc_cookie           clear existing local props?
+ * zc_cookie           received properties flag
  *
- * outputs:            none
+ * outputs:
+ * zc_nvlist_dst{_size} error for each unapplied received property
  */
 static int
 zfs_ioc_set_prop(zfs_cmd_t *zc)
 {
        nvlist_t *nvl;
+       boolean_t received = zc->zc_cookie;
+       zprop_source_t source = (received ? ZPROP_SRC_RECEIVED :
+           ZPROP_SRC_LOCAL);
+       nvlist_t *errors = NULL;
        int error;
 
        if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
            zc->zc_iflags, &nvl)) != 0)
                return (error);
 
-       if (zc->zc_cookie) {
+       if (received) {
                nvlist_t *origprops;
                objset_t *os;
 
-               if (dmu_objset_open(zc->zc_name, DMU_OST_ANY,
-                   DS_MODE_USER | DS_MODE_READONLY, &os) == 0) {
-                       if (dsl_prop_get_all(os, &origprops, TRUE) == 0) {
-                               clear_props(zc->zc_name, origprops, nvl);
+               if (dmu_objset_hold(zc->zc_name, FTAG, &os) == 0) {
+                       if (dsl_prop_get_received(os, &origprops) == 0) {
+                               (void) clear_received_props(os,
+                                   zc->zc_name, origprops, nvl);
                                nvlist_free(origprops);
                        }
-                       dmu_objset_close(os);
-               }
 
+                       dsl_prop_set_hasrecvd(os);
+                       dmu_objset_rele(os, FTAG);
+               }
        }
 
-       error = zfs_set_prop_nvlist(zc->zc_name, nvl);
+       error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, &errors);
 
+       if (zc->zc_nvlist_dst != NULL && errors != NULL) {
+               (void) put_nvlist(zc, errors);
+       }
+
+       nvlist_free(errors);
        nvlist_free(nvl);
        return (error);
 }
@@ -1932,14 +2287,75 @@ zfs_ioc_set_prop(zfs_cmd_t *zc)
  * inputs:
  * zc_name             name of filesystem
  * zc_value            name of property to inherit
+ * zc_cookie           revert to received value if TRUE
  *
  * outputs:            none
  */
 static int
 zfs_ioc_inherit_prop(zfs_cmd_t *zc)
 {
+       const char *propname = zc->zc_value;
+       zfs_prop_t prop = zfs_name_to_prop(propname);
+       boolean_t received = zc->zc_cookie;
+       zprop_source_t source = (received
+           ? ZPROP_SRC_NONE            /* revert to received value, if any */
+           : ZPROP_SRC_INHERITED);     /* explicitly inherit */
+
+       if (received) {
+               nvlist_t *dummy;
+               nvpair_t *pair;
+               zprop_type_t type;
+               int err;
+
+               /*
+                * zfs_prop_set_special() expects properties in the form of an
+                * nvpair with type info.
+                */
+               if (prop == ZPROP_INVAL) {
+                       if (!zfs_prop_user(propname))
+                               return (EINVAL);
+
+                       type = PROP_TYPE_STRING;
+               } else if (prop == ZFS_PROP_VOLSIZE ||
+                   prop == ZFS_PROP_VERSION) {
+                       return (EINVAL);
+               } else {
+                       type = zfs_prop_get_type(prop);
+               }
+
+               VERIFY(nvlist_alloc(&dummy, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+               switch (type) {
+               case PROP_TYPE_STRING:
+                       VERIFY(0 == nvlist_add_string(dummy, propname, ""));
+                       break;
+               case PROP_TYPE_NUMBER:
+               case PROP_TYPE_INDEX:
+                       VERIFY(0 == nvlist_add_uint64(dummy, propname, 0));
+                       break;
+               default:
+                       nvlist_free(dummy);
+                       return (EINVAL);
+               }
+
+               pair = nvlist_next_nvpair(dummy, NULL);
+               err = zfs_prop_set_special(zc->zc_name, source, pair);
+               nvlist_free(dummy);
+               if (err != -1)
+                       return (err); /* special property already handled */
+       } else {
+               /*
+                * Only check this in the non-received case. We want to allow
+                * 'inherit -S' to revert non-inheritable properties like quota
+                * and reservation to the received or default values even though
+                * they are not considered inheritable.
+                */
+               if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop))
+                       return (EINVAL);
+       }
+
        /* the property name has been validated by zfs_secpolicy_inherit() */
-       return (dsl_prop_set(zc->zc_name, zc->zc_value, 0, 0, NULL));
+       return (dsl_prop_set(zc->zc_name, zc->zc_value, source, 0, 0, NULL));
 }
 
 static int
@@ -1948,28 +2364,30 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc)
        nvlist_t *props;
        spa_t *spa;
        int error;
-       nvpair_t *elem;
+       nvpair_t *pair;
 
-       if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
-           zc->zc_iflags, &props)))
+       if (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+           zc->zc_iflags, &props))
                return (error);
 
        /*
         * If the only property is the configfile, then just do a spa_lookup()
         * to handle the faulted case.
         */
-       elem = nvlist_next_nvpair(props, NULL);
-       if (elem != NULL && strcmp(nvpair_name(elem),
+       pair = nvlist_next_nvpair(props, NULL);
+       if (pair != NULL && strcmp(nvpair_name(pair),
            zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 &&
-           nvlist_next_nvpair(props, elem) == NULL) {
+           nvlist_next_nvpair(props, pair) == NULL) {
                mutex_enter(&spa_namespace_lock);
                if ((spa = spa_lookup(zc->zc_name)) != NULL) {
                        spa_configfile_set(spa, props, B_FALSE);
                        spa_config_sync(spa, B_FALSE, B_TRUE);
                }
                mutex_exit(&spa_namespace_lock);
-               if (spa != NULL)
+               if (spa != NULL) {
+                       nvlist_free(props);
                        return (0);
+               }
        }
 
        if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
@@ -2016,53 +2434,6 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc)
        return (error);
 }
 
-static int
-zfs_ioc_iscsi_perm_check(zfs_cmd_t *zc)
-{
-       nvlist_t *nvp;
-       int error;
-       uint32_t uid;
-       uint32_t gid;
-       uint32_t *groups;
-       uint_t group_cnt;
-       cred_t  *usercred;
-
-       if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
-           zc->zc_iflags, &nvp)) != 0) {
-               return (error);
-       }
-
-       if ((error = nvlist_lookup_uint32(nvp,
-           ZFS_DELEG_PERM_UID, &uid)) != 0) {
-               nvlist_free(nvp);
-               return (EPERM);
-       }
-
-       if ((error = nvlist_lookup_uint32(nvp,
-           ZFS_DELEG_PERM_GID, &gid)) != 0) {
-               nvlist_free(nvp);
-               return (EPERM);
-       }
-
-       if ((error = nvlist_lookup_uint32_array(nvp, ZFS_DELEG_PERM_GROUPS,
-           &groups, &group_cnt)) != 0) {
-               nvlist_free(nvp);
-               return (EPERM);
-       }
-       usercred = cralloc();
-       if ((crsetugid(usercred, uid, gid) != 0) ||
-           (crsetgroups(usercred, group_cnt, (gid_t *)groups) != 0)) {
-               nvlist_free(nvp);
-               crfree(usercred);
-               return (EPERM);
-       }
-       nvlist_free(nvp);
-       error = dsl_deleg_access(zc->zc_name,
-           zfs_prop_to_name(ZFS_PROP_SHAREISCSI), usercred);
-       crfree(usercred);
-       return (error);
-}
-
 /*
  * inputs:
  * zc_name             name of filesystem
@@ -2135,30 +2506,6 @@ zfs_ioc_get_fsacl(zfs_cmd_t *zc)
 }
 
 /*
- * inputs:
- * zc_name             name of volume
- *
- * outputs:            none
- */
-static int
-zfs_ioc_create_minor(zfs_cmd_t *zc)
-{
-       return (zvol_create_minor(zc->zc_name, ddi_driver_major(zfs_dip)));
-}
-
-/*
- * inputs:
- * zc_name             name of volume
- *
- * outputs:            none
- */
-static int
-zfs_ioc_remove_minor(zfs_cmd_t *zc)
-{
-       return (zvol_remove_minor(zc->zc_name));
-}
-
-/*
  * Search the vfs list for a specified resource.  Returns a pointer to it
  * or NULL if no suitable entry is found. The caller of this routine
  * is responsible for releasing the returned vfs pointer.
@@ -2216,8 +2563,8 @@ zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
  */
 static int
 zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
-    boolean_t fuids_ok, nvlist_t *createprops, nvlist_t *zplprops,
-    boolean_t *is_ci)
+    boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops,
+    nvlist_t *zplprops, boolean_t *is_ci)
 {
        uint64_t sense = ZFS_PROP_UNDEFINED;
        uint64_t norm = ZFS_PROP_UNDEFINED;
@@ -2253,6 +2600,7 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
         */
        if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) ||
            (zplver >= ZPL_VERSION_FUID && !fuids_ok) ||
+           (zplver >= ZPL_VERSION_SA && !sa_ok) ||
            (zplver < ZPL_VERSION_NORMALIZATION &&
            (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED ||
            sense != ZFS_PROP_UNDEFINED)))
@@ -2294,11 +2642,13 @@ static int
 zfs_fill_zplprops(const char *dataset, nvlist_t *createprops,
     nvlist_t *zplprops, boolean_t *is_ci)
 {
-       boolean_t fuids_ok = B_TRUE;
+       boolean_t fuids_ok, sa_ok;
        uint64_t zplver = ZPL_VERSION;
        objset_t *os = NULL;
        char parentname[MAXNAMELEN];
        char *cp;
+       spa_t *spa;
+       uint64_t spa_vers;
        int error;
 
        (void) strlcpy(parentname, dataset, sizeof (parentname));
@@ -2306,23 +2656,25 @@ zfs_fill_zplprops(const char *dataset, nvlist_t *createprops,
        ASSERT(cp != NULL);
        cp[0] = '\0';
 
-       if (zfs_earlier_version(dataset, SPA_VERSION_USERSPACE))
-               zplver = ZPL_VERSION_USERSPACE - 1;
-       if (zfs_earlier_version(dataset, SPA_VERSION_FUID)) {
-               zplver = ZPL_VERSION_FUID - 1;
-               fuids_ok = B_FALSE;
-       }
+       if ((error = spa_open(dataset, &spa, FTAG)) != 0)
+               return (error);
+
+       spa_vers = spa_version(spa);
+       spa_close(spa, FTAG);
+
+       zplver = zfs_zpl_version_map(spa_vers);
+       fuids_ok = (zplver >= ZPL_VERSION_FUID);
+       sa_ok = (zplver >= ZPL_VERSION_SA);
 
        /*
         * Open parent object set so we can inherit zplprop values.
         */
-       if ((error = dmu_objset_open(parentname, DMU_OST_ANY,
-           DS_MODE_USER | DS_MODE_READONLY, &os)) != 0)
+       if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0)
                return (error);
 
-       error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, createprops,
+       error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops,
            zplprops, is_ci);
-       dmu_objset_close(os);
+       dmu_objset_rele(os, FTAG);
        return (error);
 }
 
@@ -2330,17 +2682,17 @@ static int
 zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops,
     nvlist_t *zplprops, boolean_t *is_ci)
 {
-       boolean_t fuids_ok = B_TRUE;
+       boolean_t fuids_ok;
+       boolean_t sa_ok;
        uint64_t zplver = ZPL_VERSION;
        int error;
 
-       if (spa_vers < SPA_VERSION_FUID) {
-               zplver = ZPL_VERSION_FUID - 1;
-               fuids_ok = B_FALSE;
-       }
+       zplver = zfs_zpl_version_map(spa_vers);
+       fuids_ok = (zplver >= ZPL_VERSION_FUID);
+       sa_ok = (zplver >= ZPL_VERSION_SA);
 
-       error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, createprops,
-           zplprops, is_ci);
+       error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok,
+           createprops, zplprops, is_ci);
        return (error);
 }
 
@@ -2399,21 +2751,18 @@ zfs_ioc_create(zfs_cmd_t *zc)
                        return (EINVAL);
                }
 
-               error = dmu_objset_open(zc->zc_value, type,
-                   DS_MODE_USER | DS_MODE_READONLY, &clone);
+               error = dmu_objset_hold(zc->zc_value, FTAG, &clone);
                if (error) {
                        nvlist_free(nvprops);
                        return (error);
                }
 
-               error = dmu_objset_create(zc->zc_name, type, clone, 0,
-                   NULL, NULL);
+               error = dmu_objset_clone(zc->zc_name, dmu_objset_ds(clone), 0);
+               dmu_objset_rele(clone, FTAG);
                if (error) {
-                       dmu_objset_close(clone);
                        nvlist_free(nvprops);
                        return (error);
                }
-               dmu_objset_close(clone);
        } else {
                boolean_t is_insensitive = B_FALSE;
 
@@ -2470,7 +2819,7 @@ zfs_ioc_create(zfs_cmd_t *zc)
                                return (error);
                        }
                }
-               error = dmu_objset_create(zc->zc_name, type, NULL,
+               error = dmu_objset_create(zc->zc_name, type,
                    is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct);
                nvlist_free(zct.zct_zplprops);
        }
@@ -2479,7 +2828,9 @@ zfs_ioc_create(zfs_cmd_t *zc)
         * It would be nice to do this atomically.
         */
        if (error == 0) {
-               if ((error = zfs_set_prop_nvlist(zc->zc_name, nvprops)) != 0)
+               error = zfs_set_prop_nvlist(zc->zc_name, ZPROP_SRC_LOCAL,
+                   nvprops, NULL);
+               if (error != 0)
                        (void) dmu_objset_destroy(zc->zc_name, B_FALSE);
        }
        nvlist_free(nvprops);
@@ -2493,7 +2844,8 @@ zfs_ioc_create(zfs_cmd_t *zc)
  * zc_cookie   recursive flag
  * zc_nvlist_src[_size] property list
  *
- * outputs:    none
+ * outputs:
+ * zc_value    short snapname (i.e. part after the '@')
  */
 static int
 zfs_ioc_snapshot(zfs_cmd_t *zc)
@@ -2514,7 +2866,7 @@ zfs_ioc_snapshot(zfs_cmd_t *zc)
        if (error)
                goto out;
 
-       if (nvprops != NULL && nvlist_next_nvpair(nvprops, NULL) != NULL &&
+       if (!nvlist_empty(nvprops) &&
            zfs_earlier_version(zc->zc_name, SPA_VERSION_SNAP_PROPS)) {
                error = ENOTSUP;
                goto out;
@@ -2529,20 +2881,15 @@ out:
 }
 
 int
-zfs_unmount_snap(char *name, void *arg)
+zfs_unmount_snap(const char *name, void *arg)
 {
        vfs_t *vfsp = NULL;
 
        if (arg) {
                char *snapname = arg;
-               int len = strlen(name) + strlen(snapname) + 2;
-               char *buf = kmem_alloc(len, KM_SLEEP);
-
-               (void) strcpy(buf, name);
-               (void) strcat(buf, "@");
-               (void) strcat(buf, snapname);
-               vfsp = zfs_get_vfs(buf);
-               kmem_free(buf, len);
+               char *fullname = kmem_asprintf("%s@%s", name, snapname);
+               vfsp = zfs_get_vfs(fullname);
+               strfree(fullname);
        } else if (strchr(name, '@')) {
                vfsp = zfs_get_vfs(name);
        }
@@ -2599,13 +2946,17 @@ zfs_ioc_destroy_snaps(zfs_cmd_t *zc)
 static int
 zfs_ioc_destroy(zfs_cmd_t *zc)
 {
+       int err;
        if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) {
-               int err = zfs_unmount_snap(zc->zc_name, NULL);
+               err = zfs_unmount_snap(zc->zc_name, NULL);
                if (err)
                        return (err);
        }
 
-       return (dmu_objset_destroy(zc->zc_name, zc->zc_defer_destroy));
+       err = dmu_objset_destroy(zc->zc_name, zc->zc_defer_destroy);
+       if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0)
+               (void) zvol_remove_minor(zc->zc_name);
+       return (err);
 }
 
 /*
@@ -2617,38 +2968,78 @@ zfs_ioc_destroy(zfs_cmd_t *zc)
 static int
 zfs_ioc_rollback(zfs_cmd_t *zc)
 {
-       objset_t *os;
+       dsl_dataset_t *ds, *clone;
        int error;
-       zfsvfs_t *zfsvfs = NULL;
+       zfsvfs_t *zfsvfs;
+       char *clone_name;
+
+       error = dsl_dataset_hold(zc->zc_name, FTAG, &ds);
+       if (error)
+               return (error);
+
+       /* must not be a snapshot */
+       if (dsl_dataset_is_snapshot(ds)) {
+               dsl_dataset_rele(ds, FTAG);
+               return (EINVAL);
+       }
+
+       /* must have a most recent snapshot */
+       if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) {
+               dsl_dataset_rele(ds, FTAG);
+               return (EINVAL);
+       }
 
        /*
-        * Get the zfsvfs for the receiving objset. There
-        * won't be one if we're operating on a zvol, if the
-        * objset doesn't exist yet, or is not mounted.
+        * Create clone of most recent snapshot.
         */
-       error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, DS_MODE_USER, &os);
+       clone_name = kmem_asprintf("%s/%%rollback", zc->zc_name);
+       error = dmu_objset_clone(clone_name, ds->ds_prev, DS_FLAG_INCONSISTENT);
        if (error)
-               return (error);
+               goto out;
 
-       if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) {
-               int mode;
+       error = dsl_dataset_own(clone_name, B_TRUE, FTAG, &clone);
+       if (error)
+               goto out;
 
-               error = zfs_suspend_fs(zfsvfs, NULL, &mode);
+       /*
+        * Do clone swap.
+        */
+       if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) {
+               error = zfs_suspend_fs(zfsvfs);
                if (error == 0) {
                        int resume_err;
 
-                       error = dmu_objset_rollback(os);
-                       resume_err = zfs_resume_fs(zfsvfs, zc->zc_name, mode);
+                       if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) {
+                               error = dsl_dataset_clone_swap(clone, ds,
+                                   B_TRUE);
+                               dsl_dataset_disown(ds, FTAG);
+                               ds = NULL;
+                       } else {
+                               error = EBUSY;
+                       }
+                       resume_err = zfs_resume_fs(zfsvfs, zc->zc_name);
                        error = error ? error : resume_err;
-               } else {
-                       dmu_objset_close(os);
                }
                VFS_RELE(zfsvfs->z_vfs);
        } else {
-               error = dmu_objset_rollback(os);
+               if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) {
+                       error = dsl_dataset_clone_swap(clone, ds, B_TRUE);
+                       dsl_dataset_disown(ds, FTAG);
+                       ds = NULL;
+               } else {
+                       error = EBUSY;
+               }
        }
-       /* Note, the dmu_objset_rollback() releases the objset for us. */
 
+       /*
+        * Destroy clone (which also closes it).
+        */
+       (void) dsl_dataset_destroy(clone, FTAG, B_FALSE);
+
+out:
+       strfree(clone_name);
+       if (ds)
+               dsl_dataset_rele(ds, FTAG);
        return (error);
 }
 
@@ -2681,32 +3072,268 @@ zfs_ioc_rename(zfs_cmd_t *zc)
                if (err)
                        return (err);
        }
+       if (zc->zc_objset_type == DMU_OST_ZVOL)
+               (void) zvol_remove_minor(zc->zc_name);
        return (dmu_objset_rename(zc->zc_name, zc->zc_value, recursive));
 }
 
-static void
-clear_props(char *dataset, nvlist_t *props, nvlist_t *newprops)
+static int
+zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
+{
+       const char *propname = nvpair_name(pair);
+       boolean_t issnap = (strchr(dsname, '@') != NULL);
+       zfs_prop_t prop = zfs_name_to_prop(propname);
+       uint64_t intval;
+       int err;
+
+       if (prop == ZPROP_INVAL) {
+               if (zfs_prop_user(propname)) {
+                       if (err = zfs_secpolicy_write_perms(dsname,
+                           ZFS_DELEG_PERM_USERPROP, cr))
+                               return (err);
+                       return (0);
+               }
+
+               if (!issnap && zfs_prop_userquota(propname)) {
+                       const char *perm = NULL;
+                       const char *uq_prefix =
+                           zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA];
+                       const char *gq_prefix =
+                           zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA];
+
+                       if (strncmp(propname, uq_prefix,
+                           strlen(uq_prefix)) == 0) {
+                               perm = ZFS_DELEG_PERM_USERQUOTA;
+                       } else if (strncmp(propname, gq_prefix,
+                           strlen(gq_prefix)) == 0) {
+                               perm = ZFS_DELEG_PERM_GROUPQUOTA;
+                       } else {
+                               /* USERUSED and GROUPUSED are read-only */
+                               return (EINVAL);
+                       }
+
+                       if (err = zfs_secpolicy_write_perms(dsname, perm, cr))
+                               return (err);
+                       return (0);
+               }
+
+               return (EINVAL);
+       }
+
+       if (issnap)
+               return (EINVAL);
+
+       if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+               /*
+                * dsl_prop_get_all_impl() returns properties in this
+                * format.
+                */
+               nvlist_t *attrs;
+               VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+               VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+                   &pair) == 0);
+       }
+
+       /*
+        * Check that this value is valid for this pool version
+        */
+       switch (prop) {
+       case ZFS_PROP_COMPRESSION:
+               /*
+                * If the user specified gzip compression, make sure
+                * the SPA supports it. We ignore any errors here since
+                * we'll catch them later.
+                */
+               if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
+                   nvpair_value_uint64(pair, &intval) == 0) {
+                       if (intval >= ZIO_COMPRESS_GZIP_1 &&
+                           intval <= ZIO_COMPRESS_GZIP_9 &&
+                           zfs_earlier_version(dsname,
+                           SPA_VERSION_GZIP_COMPRESSION)) {
+                               return (ENOTSUP);
+                       }
+
+                       if (intval == ZIO_COMPRESS_ZLE &&
+                           zfs_earlier_version(dsname,
+                           SPA_VERSION_ZLE_COMPRESSION))
+                               return (ENOTSUP);
+
+                       /*
+                        * If this is a bootable dataset then
+                        * verify that the compression algorithm
+                        * is supported for booting. We must return
+                        * something other than ENOTSUP since it
+                        * implies a downrev pool version.
+                        */
+                       if (zfs_is_bootfs(dsname) &&
+                           !BOOTFS_COMPRESS_VALID(intval)) {
+                               return (ERANGE);
+                       }
+               }
+               break;
+
+       case ZFS_PROP_COPIES:
+               if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS))
+                       return (ENOTSUP);
+               break;
+
+       case ZFS_PROP_DEDUP:
+               if (zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
+                       return (ENOTSUP);
+               break;
+
+       case ZFS_PROP_SHARESMB:
+               if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
+                       return (ENOTSUP);
+               break;
+
+       case ZFS_PROP_ACLINHERIT:
+               if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
+                   nvpair_value_uint64(pair, &intval) == 0) {
+                       if (intval == ZFS_ACL_PASSTHROUGH_X &&
+                           zfs_earlier_version(dsname,
+                           SPA_VERSION_PASSTHROUGH_X))
+                               return (ENOTSUP);
+               }
+               break;
+       }
+
+       return (zfs_secpolicy_setprop(dsname, prop, pair, CRED()));
+}
+
+/*
+ * Removes properties from the given props list that fail permission checks
+ * needed to clear them and to restore them in case of a receive error. For each
+ * property, make sure we have both set and inherit permissions.
+ *
+ * Returns the first error encountered if any permission checks fail. If the
+ * caller provides a non-NULL errlist, it also gives the complete list of names
+ * of all the properties that failed a permission check along with the
+ * corresponding error numbers. The caller is responsible for freeing the
+ * returned errlist.
+ *
+ * If every property checks out successfully, zero is returned and the list
+ * pointed at by errlist is NULL.
+ */
+static int
+zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist)
 {
        zfs_cmd_t *zc;
-       nvpair_t *prop;
+       nvpair_t *pair, *next_pair;
+       nvlist_t *errors;
+       int err, rv = 0;
 
        if (props == NULL)
-               return;
+               return (0);
+
+       VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
        zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP);
        (void) strcpy(zc->zc_name, dataset);
-       for (prop = nvlist_next_nvpair(props, NULL); prop;
-           prop = nvlist_next_nvpair(props, prop)) {
-               if (newprops != NULL &&
-                   nvlist_exists(newprops, nvpair_name(prop)))
-                       continue;
-               (void) strcpy(zc->zc_value, nvpair_name(prop));
-               if (zfs_secpolicy_inherit(zc, CRED()) == 0)
-                       (void) zfs_ioc_inherit_prop(zc);
+       pair = nvlist_next_nvpair(props, NULL);
+       while (pair != NULL) {
+               next_pair = nvlist_next_nvpair(props, pair);
+
+               (void) strcpy(zc->zc_value, nvpair_name(pair));
+               if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 ||
+                   (err = zfs_secpolicy_inherit(zc, CRED())) != 0) {
+                       VERIFY(nvlist_remove_nvpair(props, pair) == 0);
+                       VERIFY(nvlist_add_int32(errors,
+                           zc->zc_value, err) == 0);
+               }
+               pair = next_pair;
        }
        kmem_free(zc, sizeof (zfs_cmd_t));
+
+       if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) {
+               nvlist_free(errors);
+               errors = NULL;
+       } else {
+               VERIFY(nvpair_value_int32(pair, &rv) == 0);
+       }
+
+       if (errlist == NULL)
+               nvlist_free(errors);
+       else
+               *errlist = errors;
+
+       return (rv);
+}
+
+static boolean_t
+propval_equals(nvpair_t *p1, nvpair_t *p2)
+{
+       if (nvpair_type(p1) == DATA_TYPE_NVLIST) {
+               /* dsl_prop_get_all_impl() format */
+               nvlist_t *attrs;
+               VERIFY(nvpair_value_nvlist(p1, &attrs) == 0);
+               VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+                   &p1) == 0);
+       }
+
+       if (nvpair_type(p2) == DATA_TYPE_NVLIST) {
+               nvlist_t *attrs;
+               VERIFY(nvpair_value_nvlist(p2, &attrs) == 0);
+               VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+                   &p2) == 0);
+       }
+
+       if (nvpair_type(p1) != nvpair_type(p2))
+               return (B_FALSE);
+
+       if (nvpair_type(p1) == DATA_TYPE_STRING) {
+               char *valstr1, *valstr2;
+
+               VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0);
+               VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0);
+               return (strcmp(valstr1, valstr2) == 0);
+       } else {
+               uint64_t intval1, intval2;
+
+               VERIFY(nvpair_value_uint64(p1, &intval1) == 0);
+               VERIFY(nvpair_value_uint64(p2, &intval2) == 0);
+               return (intval1 == intval2);
+       }
 }
 
 /*
+ * Remove properties from props if they are not going to change (as determined
+ * by comparison with origprops). Remove them from origprops as well, since we
+ * do not need to clear or restore properties that won't change.
+ */
+static void
+props_reduce(nvlist_t *props, nvlist_t *origprops)
+{
+       nvpair_t *pair, *next_pair;
+
+       if (origprops == NULL)
+               return; /* all props need to be received */
+
+       pair = nvlist_next_nvpair(props, NULL);
+       while (pair != NULL) {
+               const char *propname = nvpair_name(pair);
+               nvpair_t *match;
+
+               next_pair = nvlist_next_nvpair(props, pair);
+
+               if ((nvlist_lookup_nvpair(origprops, propname,
+                   &match) != 0) || !propval_equals(pair, match))
+                       goto next; /* need to set received value */
+
+               /* don't clear the existing received value */
+               (void) nvlist_remove_nvpair(origprops, match);
+               /* don't bother receiving the property */
+               (void) nvlist_remove_nvpair(props, pair);
+next:
+               pair = next_pair;
+       }
+}
+
+#ifdef DEBUG
+static boolean_t zfs_ioc_recv_inject_err;
+#endif
+
+/*
  * inputs:
  * zc_name             name of containing filesystem
  * zc_nvlist_src{_size}        nvlist of properties to apply
@@ -2718,6 +3345,8 @@ clear_props(char *dataset, nvlist_t *props, nvlist_t *newprops)
  *
  * outputs:
  * zc_cookie           number of bytes read
+ * zc_nvlist_dst{_size} error for each unapplied received property
+ * zc_obj              zprop_errflags_t
  */
 static int
 zfs_ioc_recv(zfs_cmd_t *zc)
@@ -2726,13 +3355,17 @@ zfs_ioc_recv(zfs_cmd_t *zc)
        objset_t *os;
        dmu_recv_cookie_t drc;
        boolean_t force = (boolean_t)zc->zc_guid;
-       int error, fd;
+       int fd;
+       int error = 0;
+       int props_error = 0;
+       nvlist_t *errors;
        offset_t off;
-       nvlist_t *props = NULL;
-       nvlist_t *origprops = NULL;
+       nvlist_t *props = NULL; /* sent properties */
+       nvlist_t *origprops = NULL; /* existing properties */
        objset_t *origin = NULL;
        char *tosnap;
        char tofs[ZFS_MAXNAMELEN];
+       boolean_t first_recvd_props = B_FALSE;
 
        if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
            strchr(zc->zc_value, '@') == NULL ||
@@ -2741,8 +3374,7 @@ zfs_ioc_recv(zfs_cmd_t *zc)
 
        (void) strcpy(tofs, zc->zc_value);
        tosnap = strchr(tofs, '@');
-       *tosnap = '\0';
-       tosnap++;
+       *tosnap++ = '\0';
 
        if (zc->zc_nvlist_src != NULL &&
            (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
@@ -2756,41 +3388,90 @@ zfs_ioc_recv(zfs_cmd_t *zc)
                return (EBADF);
        }
 
-       if (props && dmu_objset_open(tofs, DMU_OST_ANY,
-           DS_MODE_USER | DS_MODE_READONLY, &os) == 0) {
+       VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+       if (props && dmu_objset_hold(tofs, FTAG, &os) == 0) {
+               if ((spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS) &&
+                   !dsl_prop_get_hasrecvd(os)) {
+                       first_recvd_props = B_TRUE;
+               }
+
                /*
-                * If new properties are supplied, they are to completely
-                * replace the existing ones, so stash away the existing ones.
+                * If new received properties are supplied, they are to
+                * completely replace the existing received properties, so stash
+                * away the existing ones.
                 */
-               (void) dsl_prop_get_all(os, &origprops, TRUE);
+               if (dsl_prop_get_received(os, &origprops) == 0) {
+                       nvlist_t *errlist = NULL;
+                       /*
+                        * Don't bother writing a property if its value won't
+                        * change (and avoid the unnecessary security checks).
+                        *
+                        * The first receive after SPA_VERSION_RECVD_PROPS is a
+                        * special case where we blow away all local properties
+                        * regardless.
+                        */
+                       if (!first_recvd_props)
+                               props_reduce(props, origprops);
+                       if (zfs_check_clearable(tofs, origprops,
+                           &errlist) != 0)
+                               (void) nvlist_merge(errors, errlist, 0);
+                       nvlist_free(errlist);
+               }
 
-               dmu_objset_close(os);
+               dmu_objset_rele(os, FTAG);
        }
 
        if (zc->zc_string[0]) {
-               error = dmu_objset_open(zc->zc_string, DMU_OST_ANY,
-                   DS_MODE_USER | DS_MODE_READONLY, &origin);
+               error = dmu_objset_hold(zc->zc_string, FTAG, &origin);
                if (error)
                        goto out;
        }
 
-       error = dmu_recv_begin(tofs, tosnap, &zc->zc_begin_record,
-           force, origin, &drc);
+       error = dmu_recv_begin(tofs, tosnap, zc->zc_top_ds,
+           &zc->zc_begin_record, force, origin, &drc);
        if (origin)
-               dmu_objset_close(origin);
+               dmu_objset_rele(origin, FTAG);
        if (error)
                goto out;
 
        /*
-        * Reset properties.  We do this before we receive the stream
-        * so that the properties are applied to the new data.
+        * Set properties before we receive the stream so that they are applied
+        * to the new data. Note that we must call dmu_recv_stream() if
+        * dmu_recv_begin() succeeds.
         */
        if (props) {
-               clear_props(tofs, origprops, props);
+               nvlist_t *errlist;
+
+               if (dmu_objset_from_ds(drc.drc_logical_ds, &os) == 0) {
+                       if (drc.drc_newfs) {
+                               if (spa_version(os->os_spa) >=
+                                   SPA_VERSION_RECVD_PROPS)
+                                       first_recvd_props = B_TRUE;
+                       } else if (origprops != NULL) {
+                               if (clear_received_props(os, tofs, origprops,
+                                   first_recvd_props ? NULL : props) != 0)
+                                       zc->zc_obj |= ZPROP_ERR_NOCLEAR;
+                       } else {
+                               zc->zc_obj |= ZPROP_ERR_NOCLEAR;
+                       }
+                       dsl_prop_set_hasrecvd(os);
+               } else if (!drc.drc_newfs) {
+                       zc->zc_obj |= ZPROP_ERR_NOCLEAR;
+               }
+
+               (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
+                   props, &errlist);
+               (void) nvlist_merge(errors, errlist, 0);
+               nvlist_free(errlist);
+       }
+
+       if (fit_error_list(zc, &errors) != 0 || put_nvlist(zc, errors) != 0) {
                /*
-                * XXX - Note, this is all-or-nothing; should be best-effort.
+                * Caller made zc->zc_nvlist_dst less than the minimum expected
+                * size or supplied an invalid address.
                 */
-               (void) zfs_set_prop_nvlist(tofs, props);
+               props_error = EINVAL;
        }
 
        off = fp->f_offset;
@@ -2802,24 +3483,17 @@ zfs_ioc_recv(zfs_cmd_t *zc)
                if (getzfsvfs(tofs, &zfsvfs) == 0) {
                        /* online recv */
                        int end_err;
-                       char *osname;
-                       int mode;
 
-                       osname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
-                       error = zfs_suspend_fs(zfsvfs, osname, &mode);
+                       error = zfs_suspend_fs(zfsvfs);
                        /*
                         * If the suspend fails, then the recv_end will
                         * likely also fail, and clean up after itself.
                         */
                        end_err = dmu_recv_end(&drc);
-                       if (error == 0) {
-                               int resume_err =
-                                   zfs_resume_fs(zfsvfs, osname, mode);
-                               error = error ? error : resume_err;
-                       }
+                       if (error == 0)
+                               error = zfs_resume_fs(zfsvfs, tofs);
                        error = error ? error : end_err;
                        VFS_RELE(zfsvfs->z_vfs);
-                       kmem_free(osname, MAXNAMELEN);
                } else {
                        error = dmu_recv_end(&drc);
                }
@@ -2829,17 +3503,64 @@ zfs_ioc_recv(zfs_cmd_t *zc)
        if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
                fp->f_offset = off;
 
+#ifdef DEBUG
+       if (zfs_ioc_recv_inject_err) {
+               zfs_ioc_recv_inject_err = B_FALSE;
+               error = 1;
+       }
+#endif
        /*
         * On error, restore the original props.
         */
        if (error && props) {
-               clear_props(tofs, props, NULL);
-               (void) zfs_set_prop_nvlist(tofs, origprops);
+               if (dmu_objset_hold(tofs, FTAG, &os) == 0) {
+                       if (clear_received_props(os, tofs, props, NULL) != 0) {
+                               /*
+                                * We failed to clear the received properties.
+                                * Since we may have left a $recvd value on the
+                                * system, we can't clear the $hasrecvd flag.
+                                */
+                               zc->zc_obj |= ZPROP_ERR_NORESTORE;
+                       } else if (first_recvd_props) {
+                               dsl_prop_unset_hasrecvd(os);
+                       }
+                       dmu_objset_rele(os, FTAG);
+               } else if (!drc.drc_newfs) {
+                       /* We failed to clear the received properties. */
+                       zc->zc_obj |= ZPROP_ERR_NORESTORE;
+               }
+
+               if (origprops == NULL && !drc.drc_newfs) {
+                       /* We failed to stash the original properties. */
+                       zc->zc_obj |= ZPROP_ERR_NORESTORE;
+               }
+
+               /*
+                * dsl_props_set() will not convert RECEIVED to LOCAL on or
+                * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL
+                * explicitly if we're restoring local properties cleared in the
+                * first new-style receive.
+                */
+               if (origprops != NULL &&
+                   zfs_set_prop_nvlist(tofs, (first_recvd_props ?
+                   ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED),
+                   origprops, NULL) != 0) {
+                       /*
+                        * We stashed the original properties but failed to
+                        * restore them.
+                        */
+                       zc->zc_obj |= ZPROP_ERR_NORESTORE;
+               }
        }
 out:
        nvlist_free(props);
        nvlist_free(origprops);
+       nvlist_free(errors);
        releasef(fd);
+
+       if (error == 0)
+               error = props_error;
+
        return (error);
 }
 
@@ -2861,8 +3582,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
        int error;
        offset_t off;
 
-       error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
-           DS_MODE_USER | DS_MODE_READONLY, &tosnap);
+       error = dmu_objset_hold(zc->zc_name, FTAG, &tosnap);
        if (error)
                return (error);
 
@@ -2876,20 +3596,19 @@ zfs_ioc_send(zfs_cmd_t *zc)
                if (cp)
                        *(cp+1) = 0;
                (void) strncat(buf, zc->zc_value, MAXPATHLEN);
-               error = dmu_objset_open(buf, DMU_OST_ANY,
-                   DS_MODE_USER | DS_MODE_READONLY, &fromsnap);
+               error = dmu_objset_hold(buf, FTAG, &fromsnap);
                kmem_free(buf, MAXPATHLEN);
                if (error) {
-                       dmu_objset_close(tosnap);
+                       dmu_objset_rele(tosnap, FTAG);
                        return (error);
                }
        }
 
        fp = getf(zc->zc_cookie);
        if (fp == NULL) {
-               dmu_objset_close(tosnap);
+               dmu_objset_rele(tosnap, FTAG);
                if (fromsnap)
-                       dmu_objset_close(fromsnap);
+                       dmu_objset_rele(fromsnap, FTAG);
                return (EBADF);
        }
 
@@ -2900,8 +3619,8 @@ zfs_ioc_send(zfs_cmd_t *zc)
                fp->f_offset = off;
        releasef(zc->zc_cookie);
        if (fromsnap)
-               dmu_objset_close(fromsnap);
-       dmu_objset_close(tosnap);
+               dmu_objset_rele(fromsnap, FTAG);
+       dmu_objset_rele(tosnap, FTAG);
        return (error);
 }
 
@@ -2977,16 +3696,38 @@ zfs_ioc_clear(zfs_cmd_t *zc)
                mutex_exit(&spa_namespace_lock);
                return (EIO);
        }
-       if (spa->spa_log_state == SPA_LOG_MISSING) {
+       if (spa_get_log_state(spa) == SPA_LOG_MISSING) {
                /* we need to let spa_open/spa_load clear the chains */
-               spa->spa_log_state = SPA_LOG_CLEAR;
+               spa_set_log_state(spa, SPA_LOG_CLEAR);
        }
+       spa->spa_last_open_failed = 0;
        mutex_exit(&spa_namespace_lock);
 
-       if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+       if (zc->zc_cookie & ZPOOL_NO_REWIND) {
+               error = spa_open(zc->zc_name, &spa, FTAG);
+       } else {
+               nvlist_t *policy;
+               nvlist_t *config = NULL;
+
+               if (zc->zc_nvlist_src == NULL)
+                       return (EINVAL);
+
+               if ((error = get_nvlist(zc->zc_nvlist_src,
+                   zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) {
+                       error = spa_open_rewind(zc->zc_name, &spa, FTAG,
+                           policy, &config);
+                       if (config != NULL) {
+                               (void) put_nvlist(zc, config);
+                               nvlist_free(config);
+                       }
+                       nvlist_free(policy);
+               }
+       }
+
+       if (error)
                return (error);
 
-       spa_vdev_state_enter(spa);
+       spa_vdev_state_enter(spa, SCL_NONE);
 
        if (zc->zc_guid == 0) {
                vd = NULL;
@@ -3019,7 +3760,8 @@ zfs_ioc_clear(zfs_cmd_t *zc)
  * zc_name     name of filesystem
  * zc_value    name of origin snapshot
  *
- * outputs:    none
+ * outputs:
+ * zc_string   name of conflicting snapshot, if there is one
  */
 static int
 zfs_ioc_promote(zfs_cmd_t *zc)
@@ -3035,7 +3777,7 @@ zfs_ioc_promote(zfs_cmd_t *zc)
                *cp = '\0';
        (void) dmu_objset_find(zc->zc_value,
            zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS);
-       return (dsl_dataset_promote(zc->zc_name));
+       return (dsl_dataset_promote(zc->zc_name, zc->zc_string));
 }
 
 /*
@@ -3059,7 +3801,7 @@ zfs_ioc_userspace_one(zfs_cmd_t *zc)
        if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
                return (EINVAL);
 
-       error = zfsvfs_hold(zc->zc_name, B_TRUE, FTAG, &zfsvfs);
+       error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs);
        if (error)
                return (error);
 
@@ -3085,13 +3827,15 @@ static int
 zfs_ioc_userspace_many(zfs_cmd_t *zc)
 {
        zfsvfs_t *zfsvfs;
-       int error;
+       int bufsize = zc->zc_nvlist_dst_size;
+
+       if (bufsize <= 0)
+               return (ENOMEM);
 
-       error = zfsvfs_hold(zc->zc_name, B_TRUE, FTAG, &zfsvfs);
+       int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs);
        if (error)
                return (error);
 
-       int bufsize = zc->zc_nvlist_dst_size;
        void *buf = kmem_alloc(bufsize, KM_SLEEP);
 
        error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie,
@@ -3119,34 +3863,31 @@ static int
 zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
 {
        objset_t *os;
-       int error;
+       int error = 0;
        zfsvfs_t *zfsvfs;
 
        if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) {
-               if (!dmu_objset_userused_enabled(zfsvfs->z_os->os)) {
+               if (!dmu_objset_userused_enabled(zfsvfs->z_os)) {
                        /*
                         * If userused is not enabled, it may be because the
                         * objset needs to be closed & reopened (to grow the
                         * objset_phys_t).  Suspend/resume the fs will do that.
                         */
-                       int mode;
-                       error = zfs_suspend_fs(zfsvfs, NULL, &mode);
-                       if (error == 0) {
-                               error = zfs_resume_fs(zfsvfs,
-                                   zc->zc_name, mode);
-                       }
+                       error = zfs_suspend_fs(zfsvfs);
+                       if (error == 0)
+                               error = zfs_resume_fs(zfsvfs, zc->zc_name);
                }
                if (error == 0)
                        error = dmu_objset_userspace_upgrade(zfsvfs->z_os);
                VFS_RELE(zfsvfs->z_vfs);
        } else {
-               error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
-                   DS_MODE_USER, &os);
+               /* XXX kind of reading contents without owning */
+               error = dmu_objset_hold(zc->zc_name, FTAG, &os);
                if (error)
                        return (error);
 
                error = dmu_objset_userspace_upgrade(os);
-               dmu_objset_close(os);
+               dmu_objset_rele(os, FTAG);
        }
 
        return (error);
@@ -3414,6 +4155,7 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
                        VN_RELE(vp);
                        VN_RELE(ZTOV(sharedir));
                        ZFS_EXIT(zfsvfs);
+                       nvlist_free(nvlist);
                        return (error);
                }
                error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target,
@@ -3444,6 +4186,7 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
  * zc_value    short name of snap
  * zc_string   user-supplied tag for this reference
  * zc_cookie   recursive flag
+ * zc_temphold set if hold is temporary
  *
  * outputs:            none
  */
@@ -3456,7 +4199,7 @@ zfs_ioc_hold(zfs_cmd_t *zc)
                return (EINVAL);
 
        return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value,
-           zc->zc_string, recursive));
+           zc->zc_string, recursive, zc->zc_temphold));
 }
 
 /*
@@ -3521,7 +4264,7 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
            B_FALSE },
        { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE,
            B_FALSE },
-       { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+       { zfs_ioc_pool_scan, zfs_secpolicy_config, POOL_NAME, B_TRUE,
            B_TRUE },
        { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE,
            B_FALSE },
@@ -3544,18 +4287,14 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
        { zfs_ioc_vdev_setfru,  zfs_secpolicy_config, POOL_NAME, B_FALSE,
            B_TRUE },
        { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
-           B_FALSE },
+           B_TRUE },
        { zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
            B_FALSE },
        { zfs_ioc_dataset_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
-           B_FALSE },
+           B_TRUE },
        { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
-           B_FALSE },
+           B_TRUE },
        { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE, B_TRUE },
-       { zfs_ioc_create_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE,
-           B_FALSE },
-       { zfs_ioc_remove_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE,
-           B_FALSE },
        { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE, B_TRUE },
        { zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE,
            B_TRUE},
@@ -3575,8 +4314,8 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
        { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE, B_FALSE },
        { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE,
            B_TRUE },
-       { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE,
-           B_TRUE },
+       { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, DATASET_NAME,
+           B_TRUE, B_TRUE },
        { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE,
            B_TRUE },
        { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE,
@@ -3591,8 +4330,6 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
            B_TRUE },
        { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
            B_FALSE },
-       { zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi, DATASET_NAME, B_FALSE,
-           B_FALSE },
        { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE, B_FALSE },
        { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE,
            B_TRUE },
@@ -3608,6 +4345,10 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
        { zfs_ioc_release, zfs_secpolicy_release, DATASET_NAME, B_TRUE,
            B_TRUE },
        { zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
+           B_TRUE },
+       { zfs_ioc_objset_recvd_props, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
+           B_FALSE },
+       { zfs_ioc_vdev_split, zfs_secpolicy_config, POOL_NAME, B_TRUE,
            B_TRUE }
 };
 
@@ -3647,8 +4388,10 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
        zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
 
        error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag);
+       if (error != 0)
+               error = EFAULT;
 
-       if (error == 0)
+       if ((error == 0) && !(flag & FKIOCTL))
                error = zfs_ioc_vec[vec].zvec_secpolicy(zc, cr);
 
        /*
@@ -3685,7 +4428,8 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
 
        rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag);
        if (error == 0) {
-               error = rc;
+               if (rc != 0)
+                       error = EFAULT;
                if (zfs_ioc_vec[vec].zvec_his_log)
                        zfs_log_history(zc);
        }
index 3f0b6b0..bf9f37b 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #include <sys/ddi.h>
 #include <sys/dsl_dataset.h>
 
-#define        ZFS_HANDLE_REPLAY(zilog, tx) \
-       if (zilog->zl_replay) { \
-               dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); \
-               zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = \
-                   zilog->zl_replaying_seq; \
-               return; \
-       }
-
 /*
  * These zfs_log_* functions must be called within a dmu tx, in one
  * of 2 contexts depending on zilog->z_replay:
@@ -175,6 +167,9 @@ zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
                ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime);
        if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
                bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ);
+       if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
+               *attrs |= (xoap->xoa_reparse == 0) ? 0 :
+                   XAT0_REPARSE;
 }
 
 static void *
@@ -248,11 +243,9 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
        size_t namesize = strlen(name) + 1;
        size_t fuidsz = 0;
 
-       if (zilog == NULL)
+       if (zil_replaying(zilog, tx))
                return;
 
-       ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
        /*
         * If we have FUIDs present then add in space for
         * domains and ACE fuid's if any.
@@ -283,21 +276,25 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
        lr = (lr_create_t *)&itx->itx_lr;
        lr->lr_doid = dzp->z_id;
        lr->lr_foid = zp->z_id;
-       lr->lr_mode = zp->z_phys->zp_mode;
-       if (!IS_EPHEMERAL(zp->z_phys->zp_uid)) {
-               lr->lr_uid = (uint64_t)zp->z_phys->zp_uid;
+       lr->lr_mode = zp->z_mode;
+       if (!IS_EPHEMERAL(zp->z_uid)) {
+               lr->lr_uid = (uint64_t)zp->z_uid;
        } else {
                lr->lr_uid = fuidp->z_fuid_owner;
        }
-       if (!IS_EPHEMERAL(zp->z_phys->zp_gid)) {
-               lr->lr_gid = (uint64_t)zp->z_phys->zp_gid;
+       if (!IS_EPHEMERAL(zp->z_gid)) {
+               lr->lr_gid = (uint64_t)zp->z_gid;
        } else {
                lr->lr_gid = fuidp->z_fuid_group;
        }
-       lr->lr_gen = zp->z_phys->zp_gen;
-       lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
-       lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
-       lr->lr_rdev = zp->z_phys->zp_rdev;
+       (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen,
+           sizeof (uint64_t));
+       (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
+           lr->lr_crtime, sizeof (uint64_t) * 2);
+
+       if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zp->z_zfsvfs), &lr->lr_rdev,
+           sizeof (lr->lr_rdev)) != 0)
+               lr->lr_rdev = 0;
 
        /*
         * Fill in xvattr info if any
@@ -353,11 +350,9 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
        lr_remove_t *lr;
        size_t namesize = strlen(name) + 1;
 
-       if (zilog == NULL)
+       if (zil_replaying(zilog, tx))
                return;
 
-       ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
        itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
        lr = (lr_remove_t *)&itx->itx_lr;
        lr->lr_doid = dzp->z_id;
@@ -379,11 +374,9 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
        lr_link_t *lr;
        size_t namesize = strlen(name) + 1;
 
-       if (zilog == NULL)
+       if (zil_replaying(zilog, tx))
                return;
 
-       ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
        itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
        lr = (lr_link_t *)&itx->itx_lr;
        lr->lr_doid = dzp->z_id;
@@ -408,21 +401,20 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
        size_t namesize = strlen(name) + 1;
        size_t linksize = strlen(link) + 1;
 
-       if (zilog == NULL)
+       if (zil_replaying(zilog, tx))
                return;
 
-       ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
        itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
        lr = (lr_create_t *)&itx->itx_lr;
        lr->lr_doid = dzp->z_id;
        lr->lr_foid = zp->z_id;
-       lr->lr_mode = zp->z_phys->zp_mode;
-       lr->lr_uid = zp->z_phys->zp_uid;
-       lr->lr_gid = zp->z_phys->zp_gid;
-       lr->lr_gen = zp->z_phys->zp_gen;
-       lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
-       lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
+       lr->lr_uid = zp->z_uid;
+       lr->lr_gid = zp->z_gid;
+       lr->lr_mode = zp->z_mode;
+       (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen,
+           sizeof (uint64_t));
+       (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
+           lr->lr_crtime, sizeof (uint64_t) * 2);
        bcopy(name, (char *)(lr + 1), namesize);
        bcopy(link, (char *)(lr + 1) + namesize, linksize);
 
@@ -444,11 +436,9 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
        size_t snamesize = strlen(sname) + 1;
        size_t dnamesize = strlen(dname) + 1;
 
-       if (zilog == NULL)
+       if (zil_replaying(zilog, tx))
                return;
 
-       ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
        itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
        lr = (lr_rename_t *)&itx->itx_lr;
        lr->lr_sdoid = sdzp->z_id;
@@ -474,14 +464,17 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
        itx_wr_state_t write_state;
        boolean_t slogging;
        uintptr_t fsync_cnt;
+       ssize_t immediate_write_sz;
 
-       if (zilog == NULL || zp->z_unlinked)
+       if (zil_replaying(zilog, tx) || zp->z_unlinked)
                return;
 
-       ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+       immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+           ? 0 : zfs_immediate_write_sz;
 
-       slogging = spa_has_slogs(zilog->zl_spa);
-       if (resid > zfs_immediate_write_sz && !slogging && resid <= zp->z_blksz)
+       slogging = spa_has_slogs(zilog->zl_spa) &&
+           (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
+       if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz)
                write_state = WR_INDIRECT;
        else if (ioflag & (FSYNC | FDSYNC))
                write_state = WR_COPIED;
@@ -510,8 +503,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
                lr = (lr_write_t *)&itx->itx_lr;
                if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
                    zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
-                       kmem_free(itx, offsetof(itx_t, itx_lr) +
-                           itx->itx_lr.lrc_reclen);
+                       zil_itx_destroy(itx);
                        itx = zil_itx_create(txtype, sizeof (*lr));
                        lr = (lr_write_t *)&itx->itx_lr;
                        write_state = WR_NEED_COPY;
@@ -552,11 +544,9 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
        uint64_t seq;
        lr_truncate_t *lr;
 
-       if (zilog == NULL || zp->z_unlinked)
+       if (zil_replaying(zilog, tx) || zp->z_unlinked)
                return;
 
-       ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
        itx = zil_itx_create(txtype, sizeof (*lr));
        lr = (lr_truncate_t *)&itx->itx_lr;
        lr->lr_foid = zp->z_id;
@@ -582,12 +572,9 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
        size_t          recsize = sizeof (lr_setattr_t);
        void            *start;
 
-
-       if (zilog == NULL || zp->z_unlinked)
+       if (zil_replaying(zilog, tx) || zp->z_unlinked)
                return;
 
-       ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
        /*
         * If XVATTR set, then log record size needs to allow
         * for lr_attr_t + xvattr mask, mapsize and create time
@@ -651,11 +638,9 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
        size_t txsize;
        size_t aclbytes = vsecp->vsa_aclentsz;
 
-       if (zilog == NULL || zp->z_unlinked)
+       if (zil_replaying(zilog, tx) || zp->z_unlinked)
                return;
 
-       ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
        txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ?
            TX_ACL_V0 : TX_ACL;
 
index 85b7970..f26009b 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -129,6 +127,8 @@ zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
                ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime);
        if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
                bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ);
+       if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
+               xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0);
 }
 
 static int
@@ -275,9 +275,9 @@ zfs_replay_create_acl(zfsvfs_t *zfsvfs,
        uint64_t txtype;
        int error;
 
+       txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
        if (byteswap) {
                byteswap_uint64_array(lracl, sizeof (*lracl));
-               txtype = (int)lr->lr_common.lrc_txtype;
                if (txtype == TX_CREATE_ACL_ATTR ||
                    txtype == TX_MKDIR_ACL_ATTR) {
                        lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
@@ -318,7 +318,7 @@ zfs_replay_create_acl(zfsvfs_t *zfsvfs,
 
        if (lr->lr_common.lrc_txtype & TX_CI)
                vflg |= FIGNORECASE;
-       switch ((int)lr->lr_common.lrc_txtype) {
+       switch (txtype) {
        case TX_CREATE_ACL:
                aclstart = (caddr_t)(lracl + 1);
                fuidstart = (caddr_t)aclstart +
@@ -391,7 +391,8 @@ bail:
 
        VN_RELE(ZTOV(dzp));
 
-       zfs_fuid_info_free(zfsvfs->z_fuid_replay);
+       if (zfsvfs->z_fuid_replay)
+               zfs_fuid_info_free(zfsvfs->z_fuid_replay);
        zfsvfs->z_fuid_replay = NULL;
 
        return (error);
@@ -413,9 +414,9 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
        uint64_t txtype;
        int error;
 
+       txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
        if (byteswap) {
                byteswap_uint64_array(lr, sizeof (*lr));
-               txtype = (int)lr->lr_common.lrc_txtype;
                if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR)
                        zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
        }
@@ -460,7 +461,7 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
                    lr->lr_uid, lr->lr_gid);
        }
 
-       switch ((int)lr->lr_common.lrc_txtype) {
+       switch (txtype) {
        case TX_CREATE_ATTR:
                lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
                xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
@@ -498,7 +499,6 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
                    &vp, kcred, NULL, vflg, NULL);
                break;
        case TX_MKXATTR:
-               name = (char *)(lr + 1);
                error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred);
                break;
        case TX_SYMLINK:
@@ -625,6 +625,7 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
        znode_t *zp;
        int error;
        ssize_t resid;
+       uint64_t orig_eof, eod, offset, length;
 
        if (byteswap)
                byteswap_uint64_array(lr, sizeof (*lr));
@@ -640,8 +641,64 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
                return (error);
        }
 
-       error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length,
-           lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
+       offset = lr->lr_offset;
+       length = lr->lr_length;
+       eod = offset + length;          /* end of data for this write */
+
+       orig_eof = zp->z_size;
+
+       /* If it's a dmu_sync() block, write the whole block */
+       if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+               uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+               if (length < blocksize) {
+                       offset -= offset % blocksize;
+                       length = blocksize;
+               }
+       }
+
+       error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset,
+           UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
+
+       /*
+        * This may be a write from a dmu_sync() for a whole block,
+        * and may extend beyond the current end of the file.
+        * We can't just replay what was written for this TX_WRITE as
+        * a future TX_WRITE2 may extend the eof and the data for that
+        * write needs to be there. So we write the whole block and
+        * reduce the eof.
+        */
+       if (orig_eof < zp->z_size) /* file length grew ? */
+               zp->z_size = eod;
+
+       VN_RELE(ZTOV(zp));
+
+       return (error);
+}
+
+/*
+ * TX_WRITE2 are only generated when dmu_sync() returns EALREADY
+ * meaning the pool block is already being synced. So now that we always write
+ * out full blocks, all we have to do is expand the eof if
+ * the file is grown.
+ */
+static int
+zfs_replay_write2(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
+{
+       znode_t *zp;
+       int error;
+       uint64_t end;
+
+       if (byteswap)
+               byteswap_uint64_array(lr, sizeof (*lr));
+
+       if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+               return (error);
+
+       end = lr->lr_offset + lr->lr_length;
+       if (end > zp->z_size) {
+               ASSERT3U(end - zp->z_size, <, zp->z_blksz);
+               zp->z_size = end;
+       }
 
        VN_RELE(ZTOV(zp));
 
@@ -658,16 +715,8 @@ zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap)
        if (byteswap)
                byteswap_uint64_array(lr, sizeof (*lr));
 
-       if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
-               /*
-                * As we can log truncates out of order, it's possible the
-                * file has been removed. In this case just drop the truncate
-                * and return success.
-                */
-               if (error == ENOENT)
-                       error = 0;
+       if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
                return (error);
-       }
 
        bzero(&fl, sizeof (fl));
        fl.l_type = F_WRLCK;
@@ -701,16 +750,8 @@ zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap)
                        zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
        }
 
-       if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
-               /*
-                * As we can log setattrs out of order, it's possible the
-                * file has been removed. In this case just drop the setattr
-                * and return success.
-                */
-               if (error == ENOENT)
-                       error = 0;
+       if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
                return (error);
-       }
 
        zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode,
            lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
@@ -756,16 +797,8 @@ zfs_replay_acl_v0(zfsvfs_t *zfsvfs, lr_acl_v0_t *lr, boolean_t byteswap)
                zfs_oldace_byteswap(ace, lr->lr_aclcnt);
        }
 
-       if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
-               /*
-                * As we can log acls out of order, it's possible the
-                * file has been removed. In this case just drop the acl
-                * and return success.
-                */
-               if (error == ENOENT)
-                       error = 0;
+       if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
                return (error);
-       }
 
        bzero(&vsa, sizeof (vsa));
        vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
@@ -813,16 +846,8 @@ zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap)
                }
        }
 
-       if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
-               /*
-                * As we can log acls out of order, it's possible the
-                * file has been removed. In this case just drop the acl
-                * and return success.
-                */
-               if (error == ENOENT)
-                       error = 0;
+       if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
                return (error);
-       }
 
        bzero(&vsa, sizeof (vsa));
        vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS;
@@ -875,4 +900,5 @@ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
        zfs_replay_create_acl,  /* TX_MKDIR_ACL */
        zfs_replay_create,      /* TX_MKDIR_ATTR */
        zfs_replay_create_acl,  /* TX_MKDIR_ACL_ATTR */
+       zfs_replay_write2,      /* TX_WRITE2 */
 };
index 4de8d8a..7fd8f60 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -112,7 +112,7 @@ zfs_range_lock_writer(znode_t *zp, rl_t *new)
                 * Range locking is also used by zvol and uses a
                 * dummied up znode. However, for zvol, we don't need to
                 * append or grow blocksize, and besides we don't have
-                * a z_phys or z_zfsvfs - so skip that processing.
+                * a "sa" data or z_zfsvfs - so skip that processing.
                 *
                 * Yes, this is ugly, and would be solved by not handling
                 * grow or append in range lock code. If that was done then
@@ -125,14 +125,14 @@ zfs_range_lock_writer(znode_t *zp, rl_t *new)
                         * This is done under z_range_lock to avoid races.
                         */
                        if (new->r_type == RL_APPEND)
-                               new->r_off = zp->z_phys->zp_size;
+                               new->r_off = zp->z_size;
 
                        /*
                         * If we need to grow the block size then grab the whole
                         * file range. This is also done under z_range_lock to
                         * avoid races.
                         */
-                       end_size = MAX(zp->z_phys->zp_size, new->r_off + len);
+                       end_size = MAX(zp->z_size, new->r_off + len);
                        if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
                            zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
                                new->r_off = 0;
diff --git a/module/zfs/zfs_sa.c b/module/zfs/zfs_sa.c
new file mode 100644 (file)
index 0000000..73a40aa
--- /dev/null
@@ -0,0 +1,312 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/vnode.h>
+#include <sys/sa.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_sa.h>
+
+/*
+ * ZPL attribute registration table.
+ * Order of attributes doesn't matter;
+ * a unique value will be assigned for each
+ * attribute that is file system specific.
+ *
+ * This is just the set of ZPL attributes that this
+ * version of ZFS deals with natively.  The file system
+ * could have other attributes stored in files, but they will be
+ * ignored.  The SA framework will preserve them, just that
+ * this version of ZFS won't change or delete them.
+ */
+
+sa_attr_reg_t zfs_attr_table[ZPL_END+1] = {
+       {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
+       {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
+       {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
+       {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
+       {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
+       {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
+       {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
+       {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
+       {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
+       {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
+       {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
+       {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
+       {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
+       {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
+       {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
+       {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
+       {"ZPL_DACL_COUNT", sizeof (uint64_t), SA_UINT64_ARRAY, 0},
+       {"ZPL_SYMLINK", 0, SA_UINT8_ARRAY, 0},
+       {"ZPL_SCANSTAMP", 32, SA_UINT8_ARRAY, 0},
+       {"ZPL_DACL_ACES", 0, SA_ACL, 0},
+       {NULL, 0, 0, 0}
+};
+
+#ifdef _KERNEL
+
+int
+zfs_sa_readlink(znode_t *zp, uio_t *uio)
+{
+       dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+       size_t bufsz;
+       int error;
+
+       bufsz = zp->z_size;
+       if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE <= db->db_size) {
+               error = uiomove((caddr_t)db->db_data +
+                   ZFS_OLD_ZNODE_PHYS_SIZE,
+                   MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
+       } else {
+               dmu_buf_t *dbp;
+               if ((error = dmu_buf_hold(zp->z_zfsvfs->z_os, zp->z_id,
+                   0, FTAG, &dbp, DMU_READ_NO_PREFETCH)) == 0) {
+                       error = uiomove(dbp->db_data,
+                           MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
+                       dmu_buf_rele(dbp, FTAG);
+               }
+       }
+       return (error);
+}
+
+void
+zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx)
+{
+       dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+
+       if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) {
+               VERIFY(dmu_set_bonus(db,
+                   len + ZFS_OLD_ZNODE_PHYS_SIZE, tx) == 0);
+               if (len) {
+                       bcopy(link, (caddr_t)db->db_data +
+                           ZFS_OLD_ZNODE_PHYS_SIZE, len);
+               }
+       } else {
+               dmu_buf_t *dbp;
+
+               zfs_grow_blocksize(zp, len, tx);
+               VERIFY(0 == dmu_buf_hold(zp->z_zfsvfs->z_os,
+                   zp->z_id, 0, FTAG, &dbp, DMU_READ_NO_PREFETCH));
+
+               dmu_buf_will_dirty(dbp, tx);
+
+               ASSERT3U(len, <=, dbp->db_size);
+               bcopy(link, dbp->db_data, len);
+               dmu_buf_rele(dbp, FTAG);
+       }
+}
+
+void
+zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap)
+{
+       zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+       xoptattr_t *xoap;
+
+       VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
+       if (zp->z_is_sa) {
+               if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
+                   &xoap->xoa_av_scanstamp,
+                   sizeof (xoap->xoa_av_scanstamp)) != 0)
+                       return;
+       } else {
+               dmu_object_info_t doi;
+               dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+               int len;
+
+               if (!(zp->z_pflags & ZFS_BONUS_SCANSTAMP))
+                       return;
+
+               sa_object_info(zp->z_sa_hdl, &doi);
+               len = sizeof (xoap->xoa_av_scanstamp) +
+                   ZFS_OLD_ZNODE_PHYS_SIZE;
+
+               if (len <= doi.doi_bonus_size) {
+                       (void) memcpy(xoap->xoa_av_scanstamp,
+                           (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+                           sizeof (xoap->xoa_av_scanstamp));
+               }
+       }
+       XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
+}
+
+void
+zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
+{
+       zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+       xoptattr_t *xoap;
+
+       VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
+       if (zp->z_is_sa)
+               VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
+                   &xoap->xoa_av_scanstamp,
+                   sizeof (xoap->xoa_av_scanstamp), tx));
+       else {
+               dmu_object_info_t doi;
+               dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+               int len;
+
+               sa_object_info(zp->z_sa_hdl, &doi);
+               len = sizeof (xoap->xoa_av_scanstamp) +
+                   ZFS_OLD_ZNODE_PHYS_SIZE;
+               if (len > doi.doi_bonus_size)
+                       VERIFY(dmu_set_bonus(db, len, tx) == 0);
+               (void) memcpy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+                   xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp));
+
+               zp->z_pflags |= ZFS_BONUS_SCANSTAMP;
+               VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+                   &zp->z_pflags, sizeof (uint64_t), tx));
+       }
+}
+
+/*
+ * I'm not convinced we should do any of this upgrade,
+ * since the SA code can read both old/new znode formats
+ * with probably little to no performance difference.
+ *
+ * All new files will be created with the new format.
+ */
+
+void
+zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
+{
+       dmu_buf_t *db = sa_get_db(hdl);
+       znode_t *zp = sa_get_userdata(hdl);
+       zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+       sa_bulk_attr_t bulk[20];
+       int count = 0;
+       sa_bulk_attr_t sa_attrs[20] = { 0 };
+       zfs_acl_locator_cb_t locate = { 0 };
+       uint64_t uid, gid, mode, rdev, xattr, parent;
+       uint64_t crtime[2], mtime[2], ctime[2];
+       zfs_acl_phys_t znode_acl;
+       char scanstamp[AV_SCANSTAMP_SZ];
+
+       /*
+        * No upgrade if ACL isn't cached
+        * since we won't know which locks are held
+        * and reading the ACL would require special "locked"
+        * interfaces that would be messy.
+        */
+       if (zp->z_acl_cached == NULL || ZTOV(zp)->v_type == VLNK)
+               return;
+
+       /* First do a bulk query of the attributes that aren't cached */
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+           &znode_acl, 88);
+
+       if (sa_bulk_lookup_locked(hdl, bulk, count) != 0)
+               return;
+
+
+       /*
+        * While the order here doesn't matter, it's best to try and organize
+        * it in such a way as to pick up an already existing layout number.
+        */
+       count = 0;
+       SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+       SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SIZE(zfsvfs), NULL,
+           &zp->z_size, 8);
+       SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GEN(zfsvfs),
+           NULL, &zp->z_gen, 8);
+       SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8);
+       SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8);
+       SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_PARENT(zfsvfs),
+           NULL, &parent, 8);
+       SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+           &zp->z_pflags, 8);
+       SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_ATIME(zfsvfs), NULL,
+           zp->z_atime, 16);
+       SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MTIME(zfsvfs), NULL,
+           &mtime, 16);
+       SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CTIME(zfsvfs), NULL,
+           &ctime, 16);
+       SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL,
+           &crtime, 16);
+       SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_LINKS(zfsvfs), NULL,
+           &zp->z_links, 8);
+       if (zp->z_vnode->v_type == VBLK || zp->z_vnode->v_type == VCHR)
+               SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs), NULL,
+                   &rdev, 8);
+       SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+           &zp->z_acl_cached->z_acl_count, 8);
+
+       if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID)
+               zfs_acl_xform(zp, zp->z_acl_cached, CRED());
+
+       locate.cb_aclp = zp->z_acl_cached;
+       SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_ACES(zfsvfs),
+           zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes);
+       if (xattr)
+               SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs),
+                   NULL, &rdev, 8);
+
+       /* if scanstamp then add scanstamp */
+
+       if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) {
+               bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+                   scanstamp, AV_SCANSTAMP_SZ);
+               SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zfsvfs),
+                   NULL, scanstamp, AV_SCANSTAMP_SZ);
+               zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP;
+       }
+
+       VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0);
+       VERIFY(sa_replace_all_by_template_locked(hdl, sa_attrs,
+           count, tx) == 0);
+       if (znode_acl.z_acl_extern_obj)
+               VERIFY(0 == dmu_object_free(zfsvfs->z_os,
+                   znode_acl.z_acl_extern_obj, tx));
+
+       zp->z_is_sa = B_TRUE;
+}
+
+void
+zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp)
+{
+       if (!zp->z_zfsvfs->z_use_sa || zp->z_is_sa)
+               return;
+
+       ASSERT(!zp->z_is_sa);
+
+       dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+       if (ZFS_EXTERNAL_ACL(zp)) {
+               dmu_tx_hold_free(tx, ZFS_EXTERNAL_ACL(zp), 0,
+                   DMU_OBJECT_END);
+       }
+}
+
+#endif
index d03f92b..f68dde8 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
+/* Portions Copyright 2010 Robert Milkowski */
+
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -46,6 +47,7 @@
 #include <sys/dsl_deleg.h>
 #include <sys/spa.h>
 #include <sys/zap.h>
+#include <sys/sa.h>
 #include <sys/varargs.h>
 #include <sys/policy.h>
 #include <sys/atomic.h>
@@ -60,6 +62,8 @@
 #include <sys/dnlc.h>
 #include <sys/dmu_objset.h>
 #include <sys/spa_boot.h>
+#include <sys/sa.h>
+#include "zfs_comutil.h"
 
 int zfsfstype;
 vfsops_t *zfs_vfsops = NULL;
@@ -163,8 +167,7 @@ zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
 
                if (zfsvfs->z_log != NULL)
                        zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
-               else
-                       txg_wait_synced(dp, 0);
+
                ZFS_EXIT(zfsvfs);
        } else {
                /*
@@ -381,14 +384,6 @@ vscan_changed_cb(void *arg, uint64_t newval)
 }
 
 static void
-acl_mode_changed_cb(void *arg, uint64_t newval)
-{
-       zfsvfs_t *zfsvfs = arg;
-
-       zfsvfs->z_acl_mode = newval;
-}
-
-static void
 acl_inherit_changed_cb(void *arg, uint64_t newval)
 {
        zfsvfs_t *zfsvfs = arg;
@@ -518,8 +513,6 @@ zfs_register_callbacks(vfs_t *vfsp)
        error = error ? error : dsl_prop_register(ds,
            "snapdir", snapdir_changed_cb, zfsvfs);
        error = error ? error : dsl_prop_register(ds,
-           "aclmode", acl_mode_changed_cb, zfsvfs);
-       error = error ? error : dsl_prop_register(ds,
            "aclinherit", acl_inherit_changed_cb, zfsvfs);
        error = error ? error : dsl_prop_register(ds,
            "vscan", vscan_changed_cb, zfsvfs);
@@ -560,7 +553,6 @@ unregister:
        (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
        (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
        (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
-       (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
        (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
            zfsvfs);
        (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
@@ -568,69 +560,59 @@ unregister:
 
 }
 
-static void
-uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid,
-    int64_t delta, dmu_tx_t *tx)
-{
-       uint64_t used = 0;
-       char buf[32];
-       int err;
-       uint64_t obj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
-
-       if (delta == 0)
-               return;
-
-       (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid);
-       err = zap_lookup(os, obj, buf, 8, 1, &used);
-       ASSERT(err == 0 || err == ENOENT);
-       /* no underflow/overflow */
-       ASSERT(delta > 0 || used >= -delta);
-       ASSERT(delta < 0 || used + delta > used);
-       used += delta;
-       if (used == 0)
-               err = zap_remove(os, obj, buf, tx);
-       else
-               err = zap_update(os, obj, buf, 8, 1, &used, tx);
-       ASSERT(err == 0);
-}
-
-static void
-zfs_space_delta_cb(objset_t *os, dmu_object_type_t bonustype,
-    void *oldbonus, void *newbonus,
-    uint64_t oldused, uint64_t newused, dmu_tx_t *tx)
+static int
+zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
+    uint64_t *userp, uint64_t *groupp)
 {
-       znode_phys_t *oldznp = oldbonus;
-       znode_phys_t *newznp = newbonus;
+       znode_phys_t *znp = data;
+       int error = 0;
 
-       if (bonustype != DMU_OT_ZNODE)
-               return;
+       /*
+        * Is it a valid type of object to track?
+        */
+       if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
+               return (ENOENT);
 
-       /* We charge 512 for the dnode (if it's allocated). */
-       if (oldznp->zp_gen != 0)
-               oldused += DNODE_SIZE;
-       if (newznp->zp_gen != 0)
-               newused += DNODE_SIZE;
+       /*
+        * If we have a NULL data pointer
+        * then assume the IDs aren't changing and
+        * return EEXIST to the dmu to let it know to
+        * use the same IDs.
+        */
+       if (data == NULL)
+               return (EEXIST);
 
-       if (oldznp->zp_uid == newznp->zp_uid) {
-               uidacct(os, B_FALSE, oldznp->zp_uid, newused-oldused, tx);
+       if (bonustype == DMU_OT_ZNODE) {
+               *userp = znp->zp_uid;
+               *groupp = znp->zp_gid;
        } else {
-               uidacct(os, B_FALSE, oldznp->zp_uid, -oldused, tx);
-               uidacct(os, B_FALSE, newznp->zp_uid, newused, tx);
-       }
+               int hdrsize;
 
-       if (oldznp->zp_gid == newznp->zp_gid) {
-               uidacct(os, B_TRUE, oldznp->zp_gid, newused-oldused, tx);
-       } else {
-               uidacct(os, B_TRUE, oldznp->zp_gid, -oldused, tx);
-               uidacct(os, B_TRUE, newznp->zp_gid, newused, tx);
+               ASSERT(bonustype == DMU_OT_SA);
+               hdrsize = sa_hdrsize(data);
+
+               if (hdrsize != 0) {
+                       *userp = *((uint64_t *)((uintptr_t)data + hdrsize +
+                           SA_UID_OFFSET));
+                       *groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
+                           SA_GID_OFFSET));
+               } else {
+                       /*
+                        * This should only happen for newly created
+                        * files that haven't had the znode data filled
+                        * in yet.
+                        */
+                       *userp = 0;
+                       *groupp = 0;
+               }
        }
+       return (error);
 }
 
 static void
 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
     char *domainbuf, int buflen, uid_t *ridp)
 {
-       extern uint64_t strtonum(const char *str, char **nptr);
        uint64_t fuid;
        const char *domain;
 
@@ -811,7 +793,7 @@ zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
 }
 
 boolean_t
-zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
+zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
 {
        char buf[32];
        uint64_t used, quota, usedobj, quotaobj;
@@ -834,33 +816,57 @@ zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
        return (used >= quota);
 }
 
+boolean_t
+zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
+{
+       uint64_t fuid;
+       uint64_t quotaobj;
+       uid_t id;
+
+       quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
+
+       id = isgroup ? zp->z_gid : zp->z_uid;
+
+       if (quotaobj == 0 || zfsvfs->z_replay)
+               return (B_FALSE);
+
+       if (IS_EPHEMERAL(id)) {
+               VERIFY(0 == sa_lookup(zp->z_sa_hdl,
+                   isgroup ? SA_ZPL_GID(zfsvfs) : SA_ZPL_UID(zfsvfs),
+                   &fuid, sizeof (fuid)));
+       } else {
+               fuid = (uint64_t)id;
+       }
+
+       return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
+}
+
 int
-zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp)
+zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
 {
        objset_t *os;
        zfsvfs_t *zfsvfs;
        uint64_t zval;
        int i, error;
+       uint64_t sa_obj;
 
-       if (error = dsl_prop_get_integer(osname, "readonly", &zval, NULL))
-               return (error);
-       if (zval)
-               mode |= DS_MODE_READONLY;
+       zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 
-       error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os);
-       if (error == EROFS) {
-               mode |= DS_MODE_READONLY;
-               error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os);
-       }
-       if (error)
+       /*
+        * We claim to always be readonly so we can open snapshots;
+        * other ZPL code will prevent us from writing to snapshots.
+        */
+       error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
+       if (error) {
+               kmem_free(zfsvfs, sizeof (zfsvfs_t));
                return (error);
+       }
 
        /*
         * Initialize the zfs-specific filesystem structure.
         * Should probably make this a kmem cache, shuffle fields,
         * and just bzero up to z_hold_mtx[].
         */
-       zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
        zfsvfs->z_vfs = NULL;
        zfsvfs->z_parent = zfsvfs;
        zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
@@ -870,15 +876,15 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp)
        error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
        if (error) {
                goto out;
-       } else if (zfsvfs->z_version > ZPL_VERSION) {
-               (void) printf("Mismatched versions:  File system "
-                   "is version %llu on-disk format, which is "
-                   "incompatible with this software version %lld!",
-                   (u_longlong_t)zfsvfs->z_version, ZPL_VERSION);
+       } else if (zfsvfs->z_version >
+           zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
+               (void) printf("Can't mount a version %lld file system "
+                   "on a version %lld pool\n. Pool must be upgraded to mount "
+                   "this file system.", (u_longlong_t)zfsvfs->z_version,
+                   (u_longlong_t)spa_version(dmu_objset_spa(os)));
                error = ENOTSUP;
                goto out;
        }
-
        if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
                goto out;
        zfsvfs->z_norm = (int)zval;
@@ -900,6 +906,26 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp)
                zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
 
        zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+       zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+
+       if (zfsvfs->z_use_sa) {
+               /* should either have both of these objects or none */
+               error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
+                   &sa_obj);
+               if (error)
+                       return (error);
+       } else {
+               /*
+                * Pre SA versions file systems should never touch
+                * either the attribute registration or layout objects.
+                */
+               sa_obj = 0;
+       }
+
+       zfsvfs->z_attr_table = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END);
+
+       if (zfsvfs->z_version >= ZPL_VERSION_SA)
+               sa_register_update_callback(os, zfs_sa_upgrade);
 
        error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
            &zfsvfs->z_root);
@@ -944,12 +970,12 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp)
        for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
                mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 
-       *zvp = zfsvfs;
+       *zfvp = zfsvfs;
        return (0);
 
 out:
-       dmu_objset_close(os);
-       *zvp = NULL;
+       dmu_objset_disown(os, zfsvfs);
+       *zfvp = NULL;
        kmem_free(zfsvfs, sizeof (zfsvfs_t));
        return (error);
 }
@@ -966,15 +992,11 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
        /*
         * Set the objset user_ptr to track its zfsvfs.
         */
-       mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
+       mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
        dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
-       mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
+       mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
 
        zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
-       if (zil_disable) {
-               zil_destroy(zfsvfs->z_log, 0);
-               zfsvfs->z_log = NULL;
-       }
 
        /*
         * If we are not mounting (ie: online recv), then we don't
@@ -994,34 +1016,36 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
                else
                        zfs_unlinked_drain(zfsvfs);
 
-               if (zfsvfs->z_log) {
-                       /*
-                        * Parse and replay the intent log.
-                        *
-                        * Because of ziltest, this must be done after
-                        * zfs_unlinked_drain().  (Further note: ziltest
-                        * doesn't use readonly mounts, where
-                        * zfs_unlinked_drain() isn't called.)  This is because
-                        * ziltest causes spa_sync() to think it's committed,
-                        * but actually it is not, so the intent log contains
-                        * many txg's worth of changes.
-                        *
-                        * In particular, if object N is in the unlinked set in
-                        * the last txg to actually sync, then it could be
-                        * actually freed in a later txg and then reallocated
-                        * in a yet later txg.  This would write a "create
-                        * object N" record to the intent log.  Normally, this
-                        * would be fine because the spa_sync() would have
-                        * written out the fact that object N is free, before
-                        * we could write the "create object N" intent log
-                        * record.
-                        *
-                        * But when we are in ziltest mode, we advance the "open
-                        * txg" without actually spa_sync()-ing the changes to
-                        * disk.  So we would see that object N is still
-                        * allocated and in the unlinked set, and there is an
-                        * intent log record saying to allocate it.
-                        */
+               /*
+                * Parse and replay the intent log.
+                *
+                * Because of ziltest, this must be done after
+                * zfs_unlinked_drain().  (Further note: ziltest
+                * doesn't use readonly mounts, where
+                * zfs_unlinked_drain() isn't called.)  This is because
+                * ziltest causes spa_sync() to think it's committed,
+                * but actually it is not, so the intent log contains
+                * many txg's worth of changes.
+                *
+                * In particular, if object N is in the unlinked set in
+                * the last txg to actually sync, then it could be
+                * actually freed in a later txg and then reallocated
+                * in a yet later txg.  This would write a "create
+                * object N" record to the intent log.  Normally, this
+                * would be fine because the spa_sync() would have
+                * written out the fact that object N is free, before
+                * we could write the "create object N" intent log
+                * record.
+                *
+                * But when we are in ziltest mode, we advance the "open
+                * txg" without actually spa_sync()-ing the changes to
+                * disk.  So we would see that object N is still
+                * allocated and in the unlinked set, and there is an
+                * intent log record saying to allocate it.
+                */
+               if (zil_replay_disable) {
+                       zil_destroy(zfsvfs->z_log, B_FALSE);
+               } else {
                        zfsvfs->z_replay = B_TRUE;
                        zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector);
                        zfsvfs->z_replay = B_FALSE;
@@ -1070,7 +1094,9 @@ zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
                vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
                vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
                vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
+               vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
        }
+       zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 }
 
 static int
@@ -1084,7 +1110,7 @@ zfs_domount(vfs_t *vfsp, char *osname)
        ASSERT(vfsp);
        ASSERT(osname);
 
-       error = zfsvfs_create(osname, DS_MODE_OWNER, &zfsvfs);
+       error = zfsvfs_create(osname, &zfsvfs);
        if (error)
                return (error);
        zfsvfs->z_vfs = vfsp;
@@ -1135,6 +1161,7 @@ zfs_domount(vfs_t *vfsp, char *osname)
                vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
                vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
        }
+       vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
 
        if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
                uint64_t pval;
@@ -1146,9 +1173,9 @@ zfs_domount(vfs_t *vfsp, char *osname)
                xattr_changed_cb(zfsvfs, pval);
                zfsvfs->z_issnap = B_TRUE;
 
-               mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
+               mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
                dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
-               mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
+               mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
        } else {
                error = zfsvfs_setup(zfsvfs, B_TRUE);
        }
@@ -1157,7 +1184,7 @@ zfs_domount(vfs_t *vfsp, char *osname)
                zfsctl_create(zfsvfs);
 out:
        if (error) {
-               dmu_objset_close(zfsvfs->z_os);
+               dmu_objset_disown(zfsvfs->z_os, zfsvfs);
                zfsvfs_free(zfsvfs);
        } else {
                atomic_add_32(&zfs_active_fs_count, 1);
@@ -1201,9 +1228,6 @@ zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
                VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
                    zfsvfs) == 0);
 
-               VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
-                   zfsvfs) == 0);
-
                VERIFY(dsl_prop_unregister(ds, "aclinherit",
                    acl_inherit_changed_cb, zfsvfs) == 0);
 
@@ -1267,6 +1291,139 @@ zfs_parse_bootfs(char *bpath, char *outpath)
        return (error);
 }
 
+/*
+ * zfs_check_global_label:
+ *     Check that the hex label string is appropriate for the dataset
+ *     being mounted into the global_zone proper.
+ *
+ *     Return an error if the hex label string is not default or
+ *     admin_low/admin_high.  For admin_low labels, the corresponding
+ *     dataset must be readonly.
+ */
+int
+zfs_check_global_label(const char *dsname, const char *hexsl)
+{
+       if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
+               return (0);
+       if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
+               return (0);
+       if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
+               /* must be readonly */
+               uint64_t rdonly;
+
+               if (dsl_prop_get_integer(dsname,
+                   zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
+                       return (EACCES);
+               return (rdonly ? 0 : EACCES);
+       }
+       return (EACCES);
+}
+
+/*
+ * zfs_mount_label_policy:
+ *     Determine whether the mount is allowed according to MAC check.
+ *     by comparing (where appropriate) label of the dataset against
+ *     the label of the zone being mounted into.  If the dataset has
+ *     no label, create one.
+ *
+ *     Returns:
+ *              0 :    access allowed
+ *             >0 :    error code, such as EACCES
+ */
+static int
+zfs_mount_label_policy(vfs_t *vfsp, char *osname)
+{
+       int             error, retv;
+       zone_t          *mntzone = NULL;
+       ts_label_t      *mnt_tsl;
+       bslabel_t       *mnt_sl;
+       bslabel_t       ds_sl;
+       char            ds_hexsl[MAXNAMELEN];
+
+       retv = EACCES;                          /* assume the worst */
+
+       /*
+        * Start by getting the dataset label if it exists.
+        */
+       error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
+           1, sizeof (ds_hexsl), &ds_hexsl, NULL);
+       if (error)
+               return (EACCES);
+
+       /*
+        * If labeling is NOT enabled, then disallow the mount of datasets
+        * which have a non-default label already.  No other label checks
+        * are needed.
+        */
+       if (!is_system_labeled()) {
+               if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
+                       return (0);
+               return (EACCES);
+       }
+
+       /*
+        * Get the label of the mountpoint.  If mounting into the global
+        * zone (i.e. mountpoint is not within an active zone and the
+        * zoned property is off), the label must be default or
+        * admin_low/admin_high only; no other checks are needed.
+        */
+       mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
+       if (mntzone->zone_id == GLOBAL_ZONEID) {
+               uint64_t zoned;
+
+               zone_rele(mntzone);
+
+               if (dsl_prop_get_integer(osname,
+                   zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
+                       return (EACCES);
+               if (!zoned)
+                       return (zfs_check_global_label(osname, ds_hexsl));
+               else
+                       /*
+                        * This is the case of a zone dataset being mounted
+                        * initially, before the zone has been fully created;
+                        * allow this mount into global zone.
+                        */
+                       return (0);
+       }
+
+       mnt_tsl = mntzone->zone_slabel;
+       ASSERT(mnt_tsl != NULL);
+       label_hold(mnt_tsl);
+       mnt_sl = label2bslabel(mnt_tsl);
+
+       if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
+               /*
+                * The dataset doesn't have a real label, so fabricate one.
+                */
+               char *str = NULL;
+
+               if (l_to_str_internal(mnt_sl, &str) == 0 &&
+                   dsl_prop_set(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
+                   ZPROP_SRC_LOCAL, 1, strlen(str) + 1, str) == 0)
+                       retv = 0;
+               if (str != NULL)
+                       kmem_free(str, strlen(str) + 1);
+       } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
+               /*
+                * Now compare labels to complete the MAC check.  If the
+                * labels are equal then allow access.  If the mountpoint
+                * label dominates the dataset label, allow readonly access.
+                * Otherwise, access is denied.
+                */
+               if (blequal(mnt_sl, &ds_sl))
+                       retv = 0;
+               else if (bldominates(mnt_sl, &ds_sl)) {
+                       vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
+                       retv = 0;
+               }
+       }
+
+       label_rele(mnt_tsl);
+       zone_rele(mntzone);
+       return (retv);
+}
+
 static int
 zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
 {
@@ -1419,8 +1576,7 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
         */
        error = secpolicy_fs_mount(cr, mvp, vfsp);
        if (error) {
-               error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr);
-               if (error == 0) {
+               if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) == 0) {
                        vattr_t         vattr;
 
                        /*
@@ -1430,16 +1586,14 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 
                        vattr.va_mask = AT_UID;
 
-                       if (error = VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
+                       if (VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
                                goto out;
                        }
 
                        if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 &&
                            VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) {
-                               error = EPERM;
                                goto out;
                        }
-
                        secpolicy_fs_mount_clearopts(cr, vfsp);
                } else {
                        goto out;
@@ -1456,6 +1610,10 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
                goto out;
        }
 
+       error = zfs_mount_label_policy(vfsp, osname);
+       if (error)
+               goto out;
+
        /*
         * When doing a remount, we simply refresh our temporary properties
         * according to those options set in the current VFS options.
@@ -1617,7 +1775,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
        mutex_enter(&zfsvfs->z_znodes_lock);
        for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
            zp = list_next(&zfsvfs->z_all_znodes, zp))
-               if (zp->z_dbuf) {
+               if (zp->z_sa_hdl) {
                        ASSERT(ZTOV(zp)->v_count > 0);
                        zfs_znode_dmu_fini(zp);
                }
@@ -1668,9 +1826,8 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
 
        ret = secpolicy_fs_unmount(cr, vfsp);
        if (ret) {
-               ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
-                   ZFS_DELEG_PERM_MOUNT, cr);
-               if (ret)
+               if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
+                   ZFS_DELEG_PERM_MOUNT, cr))
                        return (ret);
        }
 
@@ -1725,14 +1882,14 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
                /*
                 * Unset the objset user_ptr.
                 */
-               mutex_enter(&os->os->os_user_ptr_lock);
+               mutex_enter(&os->os_user_ptr_lock);
                dmu_objset_set_user(os, NULL);
-               mutex_exit(&os->os->os_user_ptr_lock);
+               mutex_exit(&os->os_user_ptr_lock);
 
                /*
                 * Finally release the objset
                 */
-               dmu_objset_close(os);
+               dmu_objset_disown(os, zfsvfs);
        }
 
        /*
@@ -1813,7 +1970,9 @@ zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
                ZFS_EXIT(zfsvfs);
                return (err);
        }
-       zp_gen = zp->z_phys->zp_gen & gen_mask;
+       (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
+           sizeof (uint64_t));
+       zp_gen = zp_gen & gen_mask;
        if (zp_gen == 0)
                zp_gen = 1;
        if (zp->z_unlinked || zp_gen != fid_gen) {
@@ -1835,17 +1994,13 @@ zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
  * 'z_teardown_inactive_lock' write held.
  */
 int
-zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *modep)
+zfs_suspend_fs(zfsvfs_t *zfsvfs)
 {
        int error;
 
        if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
                return (error);
-
-       *modep = zfsvfs->z_os->os_mode;
-       if (name)
-               dmu_objset_name(zfsvfs->z_os, name);
-       dmu_objset_close(zfsvfs->z_os);
+       dmu_objset_disown(zfsvfs->z_os, zfsvfs);
 
        return (0);
 }
@@ -1854,18 +2009,30 @@ zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *modep)
  * Reopen zfsvfs_t::z_os and release VOPs.
  */
 int
-zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode)
+zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname)
 {
-       int err;
+       int err, err2;
 
        ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
        ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
 
-       err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
+       err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs,
+           &zfsvfs->z_os);
        if (err) {
                zfsvfs->z_os = NULL;
        } else {
                znode_t *zp;
+               uint64_t sa_obj = 0;
+
+               err2 = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
+                   ZFS_SA_ATTRS, 8, 1, &sa_obj);
+
+               if ((err || err2) && zfsvfs->z_version >= ZPL_VERSION_SA)
+                       goto bail;
+
+
+               zfsvfs->z_attr_table = sa_setup(zfsvfs->z_os, sa_obj,
+                   zfs_attr_table,  ZPL_END);
 
                VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
 
@@ -1884,6 +2051,7 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode)
 
        }
 
+bail:
        /* release the VOPs */
        rw_exit(&zfsvfs->z_teardown_inactive_lock);
        rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
@@ -1906,9 +2074,11 @@ zfs_freevfs(vfs_t *vfsp)
 
        /*
         * If this is a snapshot, we have an extra VFS_HOLD on our parent
-        * from zfs_mount().  Release it here.
+        * from zfs_mount().  Release it here.  If we came through
+        * zfs_mountroot() instead, we didn't grab an extra hold, so
+        * skip the VFS_RELE for rootvfs.
         */
-       if (zfsvfs->z_issnap)
+       if (zfsvfs->z_issnap && (vfsp != rootvfs))
                VFS_RELE(zfsvfs->z_parent->z_vfs);
 
        zfsvfs_free(zfsvfs);
@@ -2000,13 +2170,23 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
        if (newvers < zfsvfs->z_version)
                return (EINVAL);
 
+       if (zfs_spa_version_map(newvers) >
+           spa_version(dmu_objset_spa(zfsvfs->z_os)))
+               return (ENOTSUP);
+
        tx = dmu_tx_create(os);
        dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
+       if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+               dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
+                   ZFS_SA_ATTRS);
+               dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+       }
        error = dmu_tx_assign(tx, TXG_WAIT);
        if (error) {
                dmu_tx_abort(tx);
                return (error);
        }
+
        error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
            8, 1, &newvers, tx);
 
@@ -2015,9 +2195,24 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
                return (error);
        }
 
-       spa_history_internal_log(LOG_DS_UPGRADE,
-           dmu_objset_spa(os), tx, CRED(),
-           "oldver=%llu newver=%llu dataset = %llu",
+       if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+               uint64_t sa_obj;
+
+               ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
+                   SPA_VERSION_SA);
+               sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+                   DMU_OT_NONE, 0, tx);
+
+               error = zap_add(os, MASTER_NODE_OBJ,
+                   ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+               ASSERT3U(error, ==, 0);
+
+               VERIFY(0 == sa_set_sa_object(os, sa_obj));
+               sa_register_update_callback(os, zfs_sa_upgrade);
+       }
+
+       spa_history_log_internal(LOG_DS_UPGRADE,
+           dmu_objset_spa(os), tx, "oldver=%llu newver=%llu dataset = %llu",
            zfsvfs->z_version, newvers, dmu_objset_id(os));
 
        dmu_tx_commit(tx);
index 8eb4665..aa43c06 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
+/* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
+#include <sys/dmu_objset.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/dbuf.h>
 #include <sys/zap.h>
+#include <sys/sa.h>
 #include <sys/dirent.h>
 #include <sys/policy.h>
 #include <sys/sunddi.h>
 #include "fs/fs_subr.h"
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
+#include <sys/zfs_sa.h>
 #include <sys/dnlc.h>
 #include <sys/zfs_rlock.h>
 #include <sys/extdirent.h>
 #include <sys/kidmap.h>
-#include <sys/cred_impl.h>
+#include <sys/cred.h>
 #include <sys/attr.h>
 
 /*
@@ -176,7 +179,7 @@ zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zp);
 
-       if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) &&
+       if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
            ((flag & FAPPEND) == 0)) {
                ZFS_EXIT(zfsvfs);
                return (EPERM);
@@ -184,8 +187,7 @@ zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
 
        if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
            ZTOV(zp)->v_type == VREG &&
-           !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
-           zp->z_phys->zp_size > 0) {
+           !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
                if (fs_vscan(*vpp, cr, 0) != 0) {
                        ZFS_EXIT(zfsvfs);
                        return (EACCES);
@@ -223,8 +225,7 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
 
        if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
            ZTOV(zp)->v_type == VREG &&
-           !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
-           zp->z_phys->zp_size > 0)
+           !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
                VERIFY(fs_vscan(vp, cr, 1) == 0);
 
        ZFS_EXIT(zfsvfs);
@@ -244,7 +245,7 @@ zfs_holey(vnode_t *vp, int cmd, offset_t *off)
        int error;
        boolean_t hole;
 
-       file_sz = zp->z_phys->zp_size;
+       file_sz = zp->z_size;
        if (noff >= file_sz)  {
                return (ENXIO);
        }
@@ -447,12 +448,13 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
        ssize_t         n, nbytes;
        int             error;
        rl_t            *rl;
+       xuio_t          *xuio = NULL;
 
        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zp);
        os = zfsvfs->z_os;
 
-       if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) {
+       if (zp->z_pflags & ZFS_AV_QUARANTINED) {
                ZFS_EXIT(zfsvfs);
                return (EACCES);
        }
@@ -476,7 +478,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
        /*
         * Check for mandatory locks
         */
-       if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
+       if (MANDMODE(zp->z_mode)) {
                if (error = chklock(vp, FREAD,
                    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
                        ZFS_EXIT(zfsvfs);
@@ -487,7 +489,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
        /*
         * If we're in FRSYNC mode, sync out this znode before reading it.
         */
-       if (ioflag & FRSYNC)
+       if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
                zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
 
        /*
@@ -499,13 +501,42 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
         * If we are reading past end-of-file we can skip
         * to the end; but we might still need to set atime.
         */
-       if (uio->uio_loffset >= zp->z_phys->zp_size) {
+       if (uio->uio_loffset >= zp->z_size) {
                error = 0;
                goto out;
        }
 
-       ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
-       n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
+       ASSERT(uio->uio_loffset < zp->z_size);
+       n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
+
+       if ((uio->uio_extflg == UIO_XUIO) &&
+           (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
+               int nblk;
+               int blksz = zp->z_blksz;
+               uint64_t offset = uio->uio_loffset;
+
+               xuio = (xuio_t *)uio;
+               if ((ISP2(blksz))) {
+                       nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
+                           blksz)) / blksz;
+               } else {
+                       ASSERT(offset + n <= blksz);
+                       nblk = 1;
+               }
+               (void) dmu_xuio_init(xuio, nblk);
+
+               if (vn_has_cached_data(vp)) {
+                       /*
+                        * For simplicity, we always allocate a full buffer
+                        * even if we only expect to read a portion of a block.
+                        */
+                       while (--nblk >= 0) {
+                               (void) dmu_xuio_add(xuio,
+                                   dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+                                   blksz), 0, blksz);
+                       }
+               }
+       }
 
        while (n > 0) {
                nbytes = MIN(n, zfs_read_chunk_size -
@@ -524,7 +555,6 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 
                n -= nbytes;
        }
-
 out:
        zfs_range_unlock(rl);
 
@@ -551,6 +581,7 @@ out:
  * Timestamps:
  *     vp - ctime|mtime updated if byte count > 0
  */
+
 /* ARGSUSED */
 static int
 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
@@ -567,9 +598,17 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
        ssize_t         n, nbytes;
        rl_t            *rl;
        int             max_blksz = zfsvfs->z_max_blksz;
-       uint64_t        pflags;
        int             error;
        arc_buf_t       *abuf;
+       iovec_t         *aiov;
+       xuio_t          *xuio = NULL;
+       int             i_iov = 0;
+       int             iovcnt = uio->uio_iovcnt;
+       iovec_t         *iovp = uio->uio_iov;
+       int             write_eof;
+       int             count = 0;
+       sa_bulk_attr_t  bulk[4];
+       uint64_t        mtime[2], ctime[2];
 
        /*
         * Fasttrack empty write
@@ -584,13 +623,19 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zp);
 
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+           &zp->z_size, 8);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+           &zp->z_pflags, 8);
+
        /*
         * If immutable or not appending then return EPERM
         */
-       pflags = zp->z_phys->zp_flags;
-       if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
-           ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
-           (uio->uio_loffset < zp->z_phys->zp_size))) {
+       if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
+           ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
+           (uio->uio_loffset < zp->z_size))) {
                ZFS_EXIT(zfsvfs);
                return (EPERM);
        }
@@ -598,44 +643,59 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
        zilog = zfsvfs->z_log;
 
        /*
+        * Validate file offset
+        */
+       woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
+       if (woff < 0) {
+               ZFS_EXIT(zfsvfs);
+               return (EINVAL);
+       }
+
+       /*
+        * Check for mandatory locks before calling zfs_range_lock()
+        * in order to prevent a deadlock with locks set via fcntl().
+        */
+       if (MANDMODE((mode_t)zp->z_mode) &&
+           (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
+               ZFS_EXIT(zfsvfs);
+               return (error);
+       }
+
+       /*
         * Pre-fault the pages to ensure slow (eg NFS) pages
         * don't hold up txg.
+        * Skip this if uio contains loaned arc_buf.
         */
-       uio_prefaultpages(n, uio);
+       if ((uio->uio_extflg == UIO_XUIO) &&
+           (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
+               xuio = (xuio_t *)uio;
+       else
+               uio_prefaultpages(n, uio);
 
        /*
         * If in append mode, set the io offset pointer to eof.
         */
        if (ioflag & FAPPEND) {
                /*
-                * Range lock for a file append:
-                * The value for the start of range will be determined by
-                * zfs_range_lock() (to guarantee append semantics).
-                * If this write will cause the block size to increase,
-                * zfs_range_lock() will lock the entire file, so we must
-                * later reduce the range after we grow the block size.
+                * Obtain an appending range lock to guarantee file append
+                * semantics.  We reset the write offset once we have the lock.
                 */
                rl = zfs_range_lock(zp, 0, n, RL_APPEND);
+               woff = rl->r_off;
                if (rl->r_len == UINT64_MAX) {
-                       /* overlocked, zp_size can't change */
-                       woff = uio->uio_loffset = zp->z_phys->zp_size;
-               } else {
-                       woff = uio->uio_loffset = rl->r_off;
+                       /*
+                        * We overlocked the file because this write will cause
+                        * the file block size to increase.
+                        * Note that zp_size cannot change with this lock held.
+                        */
+                       woff = zp->z_size;
                }
+               uio->uio_loffset = woff;
        } else {
-               woff = uio->uio_loffset;
-               /*
-                * Validate file offset
-                */
-               if (woff < 0) {
-                       ZFS_EXIT(zfsvfs);
-                       return (EINVAL);
-               }
-
                /*
-                * If we need to grow the block size then zfs_range_lock()
-                * will lock a wider range than we request here.
-                * Later after growing the block size we reduce the range.
+                * Note that if the file block size will change as a result of
+                * this write, then this range lock will lock the entire file
+                * so that we can re-write the block safely.
                 */
                rl = zfs_range_lock(zp, woff, n, RL_WRITER);
        }
@@ -649,16 +709,10 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
        if ((woff + n) > limit || woff > (limit - n))
                n = limit - woff;
 
-       /*
-        * Check for mandatory locks
-        */
-       if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
-           (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
-               zfs_range_unlock(rl);
-               ZFS_EXIT(zfsvfs);
-               return (error);
-       }
-       end_size = MAX(zp->z_phys->zp_size, woff + n);
+       /* Will this write extend the file length? */
+       write_eof = (woff + n > zp->z_size);
+
+       end_size = MAX(zp->z_size, woff + n);
 
        /*
         * Write the file in reasonable size chunks.  Each chunk is written
@@ -668,31 +722,41 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
        while (n > 0) {
                abuf = NULL;
                woff = uio->uio_loffset;
-
 again:
-               if (zfs_usergroup_overquota(zfsvfs,
-                   B_FALSE, zp->z_phys->zp_uid) ||
-                   zfs_usergroup_overquota(zfsvfs,
-                   B_TRUE, zp->z_phys->zp_gid)) {
+               if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
+                   zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
                        if (abuf != NULL)
                                dmu_return_arcbuf(abuf);
                        error = EDQUOT;
                        break;
                }
 
-               /*
-                * If dmu_assign_arcbuf() is expected to execute with minimum
-                * overhead loan an arc buffer and copy user data to it before
-                * we enter a txg.  This avoids holding a txg forever while we
-                * pagefault on a hanging NFS server mapping.
-                */
-               if (abuf == NULL && n >= max_blksz &&
-                   woff >= zp->z_phys->zp_size &&
+               if (xuio && abuf == NULL) {
+                       ASSERT(i_iov < iovcnt);
+                       aiov = &iovp[i_iov];
+                       abuf = dmu_xuio_arcbuf(xuio, i_iov);
+                       dmu_xuio_clear(xuio, i_iov);
+                       DTRACE_PROBE3(zfs_cp_write, int, i_iov,
+                           iovec_t *, aiov, arc_buf_t *, abuf);
+                       ASSERT((aiov->iov_base == abuf->b_data) ||
+                           ((char *)aiov->iov_base - (char *)abuf->b_data +
+                           aiov->iov_len == arc_buf_size(abuf)));
+                       i_iov++;
+               } else if (abuf == NULL && n >= max_blksz &&
+                   woff >= zp->z_size &&
                    P2PHASE(woff, max_blksz) == 0 &&
                    zp->z_blksz == max_blksz) {
+                       /*
+                        * This write covers a full block.  "Borrow" a buffer
+                        * from the dmu so that we can fill it before we enter
+                        * a transaction.  This avoids the possibility of
+                        * holding up the transaction if the data copy hangs
+                        * up on a pagefault (e.g., from an NFS server mapping).
+                        */
                        size_t cbytes;
 
-                       abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz);
+                       abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+                           max_blksz);
                        ASSERT(abuf != NULL);
                        ASSERT(arc_buf_size(abuf) == max_blksz);
                        if (error = uiocopy(abuf->b_data, max_blksz,
@@ -707,8 +771,9 @@ again:
                 * Start a transaction.
                 */
                tx = dmu_tx_create(zfsvfs->z_os);
-               dmu_tx_hold_bonus(tx, zp->z_id);
+               dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
                dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
+               zfs_sa_upgrade_txholds(tx, zp);
                error = dmu_tx_assign(tx, TXG_NOWAIT);
                if (error) {
                        if (error == ERESTART) {
@@ -749,13 +814,30 @@ again:
 
                if (abuf == NULL) {
                        tx_bytes = uio->uio_resid;
-                       error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio,
-                           nbytes, tx);
+                       error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+                           uio, nbytes, tx);
                        tx_bytes -= uio->uio_resid;
                } else {
                        tx_bytes = nbytes;
-                       ASSERT(tx_bytes == max_blksz);
-                       dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx);
+                       ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
+                       /*
+                        * If this is not a full block write, but we are
+                        * extending the file past EOF and this data starts
+                        * block-aligned, use assign_arcbuf().  Otherwise,
+                        * write via dmu_write().
+                        */
+                       if (tx_bytes < max_blksz && (!write_eof ||
+                           aiov->iov_base != abuf->b_data)) {
+                               ASSERT(xuio);
+                               dmu_write(zfsvfs->z_os, zp->z_id, woff,
+                                   aiov->iov_len, aiov->iov_base, tx);
+                               dmu_return_arcbuf(abuf);
+                               xuio_stat_wbuf_copied();
+                       } else {
+                               ASSERT(xuio || tx_bytes == max_blksz);
+                               dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
+                                   woff, abuf, tx);
+                       }
                        ASSERT(tx_bytes <= uio->uio_resid);
                        uioskip(uio, tx_bytes);
                }
@@ -769,6 +851,8 @@ again:
                 * partial progress, update the znode and ZIL accordingly.
                 */
                if (tx_bytes == 0) {
+                       (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+                           (void *)&zp->z_size, sizeof (uint64_t), tx);
                        dmu_tx_commit(tx);
                        ASSERT(error != 0);
                        break;
@@ -782,33 +866,35 @@ again:
                 * been done, but that would still expose the ISUID/ISGID
                 * to another app after the partial write is committed.
                 *
-                * Note: we don't call zfs_fuid_map_id() here because
-                * user 0 is not an ephemeral uid.
                 */
                mutex_enter(&zp->z_acl_lock);
-               if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
+               if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
                    (S_IXUSR >> 6))) != 0 &&
-                   (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
+                   (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
                    secpolicy_vnode_setid_retain(cr,
-                   (zp->z_phys->zp_mode & S_ISUID) != 0 &&
-                   zp->z_phys->zp_uid == 0) != 0) {
-                       zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
+                   (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
+                       uint64_t newmode;
+                       zp->z_mode &= ~(S_ISUID | S_ISGID);
+                       newmode = zp->z_mode;
+                       (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
+                           (void *)&newmode, sizeof (uint64_t), tx);
                }
                mutex_exit(&zp->z_acl_lock);
 
-               /*
-                * Update time stamp.  NOTE: This marks the bonus buffer as
-                * dirty, so we don't have to do it again for zp_size.
-                */
-               zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+               zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
+                   B_TRUE);
 
                /*
                 * Update the file size (zp_size) if it has changed;
                 * account for possible concurrent updates.
                 */
-               while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
-                       (void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
+               while ((end_size = zp->z_size) < uio->uio_loffset) {
+                       (void) atomic_cas_64(&zp->z_size, end_size,
                            uio->uio_loffset);
+                       ASSERT(error == 0);
+               }
+               error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
                zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
                dmu_tx_commit(tx);
 
@@ -829,7 +915,8 @@ again:
                return (error);
        }
 
-       if (ioflag & (FSYNC | FDSYNC))
+       if (ioflag & (FSYNC | FDSYNC) ||
+           zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
                zil_commit(zilog, zp->z_last_itx, zp->z_id);
 
        ZFS_EXIT(zfsvfs);
@@ -837,21 +924,25 @@ again:
 }
 
 void
-zfs_get_done(dmu_buf_t *db, void *vzgd)
+zfs_get_done(zgd_t *zgd, int error)
 {
-       zgd_t *zgd = (zgd_t *)vzgd;
-       rl_t *rl = zgd->zgd_rl;
-       vnode_t *vp = ZTOV(rl->r_zp);
-       objset_t *os = rl->r_zp->z_zfsvfs->z_os;
+       znode_t *zp = zgd->zgd_private;
+       objset_t *os = zp->z_zfsvfs->z_os;
+
+       if (zgd->zgd_db)
+               dmu_buf_rele(zgd->zgd_db, zgd);
+
+       zfs_range_unlock(zgd->zgd_rl);
 
-       dmu_buf_rele(db, vzgd);
-       zfs_range_unlock(rl);
        /*
         * Release the vnode asynchronously as we currently have the
         * txg stopped from syncing.
         */
-       VN_RELE_ASYNC(vp, dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
-       zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+       VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
+
+       if (error == 0 && zgd->zgd_bp)
+               zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+
        kmem_free(zgd, sizeof (zgd_t));
 }
 
@@ -868,20 +959,21 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
        zfsvfs_t *zfsvfs = arg;
        objset_t *os = zfsvfs->z_os;
        znode_t *zp;
-       uint64_t off = lr->lr_offset;
+       uint64_t object = lr->lr_foid;
+       uint64_t offset = lr->lr_offset;
+       uint64_t size = lr->lr_length;
+       blkptr_t *bp = &lr->lr_blkptr;
        dmu_buf_t *db;
-       rl_t *rl;
        zgd_t *zgd;
-       int dlen = lr->lr_length;               /* length of user data */
        int error = 0;
 
-       ASSERT(zio);
-       ASSERT(dlen != 0);
+       ASSERT(zio != NULL);
+       ASSERT(size != 0);
 
        /*
         * Nothing to do if the file has been removed
         */
-       if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
+       if (zfs_zget(zfsvfs, object, &zp) != 0)
                return (ENOENT);
        if (zp->z_unlinked) {
                /*
@@ -893,6 +985,10 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
                return (ENOENT);
        }
 
+       zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+       zgd->zgd_zilog = zfsvfs->z_log;
+       zgd->zgd_private = zp;
+
        /*
         * Write records come in two flavors: immediate and indirect.
         * For small writes it's cheaper to store the data with the
@@ -901,17 +997,16 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
         * we don't have to write the data twice.
         */
        if (buf != NULL) { /* immediate write */
-               rl = zfs_range_lock(zp, off, dlen, RL_READER);
+               zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
                /* test for truncation needs to be done while range locked */
-               if (off >= zp->z_phys->zp_size) {
+               if (offset >= zp->z_size) {
                        error = ENOENT;
-                       goto out;
+               } else {
+                       error = dmu_read(os, object, offset, size, buf,
+                           DMU_READ_NO_PREFETCH);
                }
-               VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf,
-                   DMU_READ_NO_PREFETCH));
+               ASSERT(error == 0 || error == ENOENT);
        } else { /* indirect write */
-               uint64_t boff; /* block starting offset */
-
                /*
                 * Have to lock the whole block to ensure when it's
                 * written out and it's checksum is being calculated
@@ -919,68 +1014,59 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
                 * blocksize after we get the lock in case it's changed!
                 */
                for (;;) {
-                       if (ISP2(zp->z_blksz)) {
-                               boff = P2ALIGN_TYPED(off, zp->z_blksz,
-                                   uint64_t);
-                       } else {
-                               boff = 0;
-                       }
-                       dlen = zp->z_blksz;
-                       rl = zfs_range_lock(zp, boff, dlen, RL_READER);
-                       if (zp->z_blksz == dlen)
+                       uint64_t blkoff;
+                       size = zp->z_blksz;
+                       blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
+                       offset -= blkoff;
+                       zgd->zgd_rl = zfs_range_lock(zp, offset, size,
+                           RL_READER);
+                       if (zp->z_blksz == size)
                                break;
-                       zfs_range_unlock(rl);
+                       offset += blkoff;
+                       zfs_range_unlock(zgd->zgd_rl);
                }
                /* test for truncation needs to be done while range locked */
-               if (off >= zp->z_phys->zp_size) {
+               if (lr->lr_offset >= zp->z_size)
                        error = ENOENT;
-                       goto out;
-               }
-               zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
-               zgd->zgd_rl = rl;
-               zgd->zgd_zilog = zfsvfs->z_log;
-               zgd->zgd_bp = &lr->lr_blkptr;
 #ifdef DEBUG
                if (zil_fault_io) {
                        error = EIO;
                        zil_fault_io = 0;
-               } else {
-                       error = dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db);
                }
-#else
-               error = dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db);
 #endif
-               if (error != 0) {
-                       kmem_free(zgd, sizeof (zgd_t));
-                       goto out;
-               }
-
-               ASSERT(boff == db->db_offset);
-               lr->lr_blkoff = off - boff;
-               error = dmu_sync(zio, db, &lr->lr_blkptr,
-                   lr->lr_common.lrc_txg, zfs_get_done, zgd);
-               ASSERT((error && error != EINPROGRESS) ||
-                   lr->lr_length <= zp->z_blksz);
                if (error == 0)
-                       zil_add_block(zfsvfs->z_log, &lr->lr_blkptr);
-               /*
-                * If we get EINPROGRESS, then we need to wait for a
-                * write IO initiated by dmu_sync() to complete before
-                * we can release this dbuf.  We will finish everything
-                * up in the zfs_get_done() callback.
-                */
-               if (error == EINPROGRESS)
-                       return (0);
-               dmu_buf_rele(db, zgd);
-               kmem_free(zgd, sizeof (zgd_t));
+                       error = dmu_buf_hold(os, object, offset, zgd, &db,
+                           DMU_READ_NO_PREFETCH);
+
+               if (error == 0) {
+                       zgd->zgd_db = db;
+                       zgd->zgd_bp = bp;
+
+                       ASSERT(db->db_offset == offset);
+                       ASSERT(db->db_size == size);
+
+                       error = dmu_sync(zio, lr->lr_common.lrc_txg,
+                           zfs_get_done, zgd);
+                       ASSERT(error || lr->lr_length <= zp->z_blksz);
+
+                       /*
+                        * On success, we need to wait for the write I/O
+                        * initiated by dmu_sync() to complete before we can
+                        * release this dbuf.  We will finish everything up
+                        * in the zfs_get_done() callback.
+                        */
+                       if (error == 0)
+                               return (0);
+
+                       if (error == EALREADY) {
+                               lr->lr_common.lrc_txtype = TX_WRITE2;
+                               error = 0;
+                       }
+               }
        }
-out:
-       zfs_range_unlock(rl);
-       /*
-        * Release the vnode asynchronously as we currently have the
-        * txg stopped from syncing.
-        */
-       VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
+
+       zfs_get_done(zgd, error);
+
        return (error);
 }
 
@@ -1063,7 +1149,7 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
 
                if (dvp->v_type != VDIR) {
                        return (ENOTDIR);
-               } else if (zdp->z_dbuf == NULL) {
+               } else if (zdp->z_sa_hdl == NULL) {
                        return (EIO);
                }
 
@@ -1115,7 +1201,7 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
                 * We don't allow recursive attributes..
                 * Maybe someday we will.
                 */
-               if (zdp->z_phys->zp_flags & ZFS_XATTR) {
+               if (zdp->z_pflags & ZFS_XATTR) {
                        ZFS_EXIT(zfsvfs);
                        return (EINVAL);
                }
@@ -1208,8 +1294,9 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
        ksid_t          *ksid;
        uid_t           uid;
        gid_t           gid = crgetgid(cr);
-       zfs_acl_ids_t   acl_ids;
+       zfs_acl_ids_t   acl_ids;
        boolean_t       fuid_dirtied;
+       boolean_t       have_acl = B_FALSE;
 
        /*
         * If we have an ephemeral id, ACL, or XVATTR then
@@ -1275,6 +1362,7 @@ top:
                        return (error);
                }
        }
+
        if (zp == NULL) {
                uint64_t txtype;
 
@@ -1290,15 +1378,18 @@ top:
                 * We only support the creation of regular files in
                 * extended attribute directories.
                 */
-               if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
+
+               if ((dzp->z_pflags & ZFS_XATTR) &&
                    (vap->va_type != VREG)) {
                        error = EINVAL;
                        goto out;
                }
 
-               if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
-                   &acl_ids)) != 0)
+               if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
+                   cr, vsecp, &acl_ids)) != 0)
                        goto out;
+               have_acl = B_TRUE;
+
                if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
                        zfs_acl_ids_free(&acl_ids);
                        error = EDQUOT;
@@ -1306,36 +1397,39 @@ top:
                }
 
                tx = dmu_tx_create(os);
-               dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+
+               dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+                   ZFS_SA_BASE_ATTR_SIZE);
+
                fuid_dirtied = zfsvfs->z_fuid_dirty;
                if (fuid_dirtied)
                        zfs_fuid_txhold(zfsvfs, tx);
-               dmu_tx_hold_bonus(tx, dzp->z_id);
                dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
-               if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+               dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+               if (!zfsvfs->z_use_sa &&
+                   acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
                        dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
-                           0, SPA_MAXBLOCKSIZE);
+                           0, acl_ids.z_aclp->z_acl_bytes);
                }
                error = dmu_tx_assign(tx, TXG_NOWAIT);
                if (error) {
-                       zfs_acl_ids_free(&acl_ids);
                        zfs_dirent_unlock(dl);
                        if (error == ERESTART) {
                                dmu_tx_wait(tx);
                                dmu_tx_abort(tx);
                                goto top;
                        }
+                       zfs_acl_ids_free(&acl_ids);
                        dmu_tx_abort(tx);
                        ZFS_EXIT(zfsvfs);
                        return (error);
                }
-               zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
+               zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
                if (fuid_dirtied)
                        zfs_fuid_sync(zfsvfs, tx);
 
                (void) zfs_link_create(dl, zp, tx, ZNEW);
-
                txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
                if (flag & FIGNORECASE)
                        txtype |= TX_CI;
@@ -1401,6 +1495,9 @@ out:
                error = specvp_check(vpp, cr);
        }
 
+       if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+               zil_commit(zilog, UINT64_MAX, 0);
+
        ZFS_EXIT(zfsvfs);
        return (error);
 }
@@ -1421,6 +1518,9 @@ out:
  *     dvp - ctime|mtime
  *      vp - ctime (if nlink > 0)
  */
+
+uint64_t null_xattr = 0;
+
 /*ARGSUSED*/
 static int
 zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
@@ -1431,7 +1531,8 @@ zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
        vnode_t         *vp;
        zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
        zilog_t         *zilog;
-       uint64_t        acl_obj, xattr_obj;
+       uint64_t        acl_obj, xattr_obj = 0;
+       uint64_t        xattr_obj_unlinked = 0;
        zfs_dirlock_t   *dl;
        dmu_tx_t        *tx;
        boolean_t       may_delete_now, delete_now = FALSE;
@@ -1497,24 +1598,29 @@ top:
         */
        tx = dmu_tx_create(zfsvfs->z_os);
        dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
-       dmu_tx_hold_bonus(tx, zp->z_id);
+       dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+       zfs_sa_upgrade_txholds(tx, zp);
+       zfs_sa_upgrade_txholds(tx, dzp);
        if (may_delete_now) {
                toobig =
-                   zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
+                   zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
                /* if the file is too big, only hold_free a token amount */
                dmu_tx_hold_free(tx, zp->z_id, 0,
                    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
        }
 
        /* are there any extended attributes? */
-       if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
-               /* XXX - do we need this if we are deleting? */
-               dmu_tx_hold_bonus(tx, xattr_obj);
+       error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+           &xattr_obj, sizeof (xattr_obj));
+       if (xattr_obj) {
+               error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+               ASSERT3U(error, ==, 0);
+               dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+               dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
        }
 
        /* are there any additional acls */
-       if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
-           may_delete_now)
+       if ((acl_obj = ZFS_EXTERNAL_ACL(zp)) != 0 && may_delete_now)
                dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
 
        /* charge as an update -- would be nice not to charge at all */
@@ -1547,26 +1653,37 @@ top:
        }
 
        if (unlinked) {
+
                mutex_enter(&vp->v_lock);
+
+               (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+                   &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
                delete_now = may_delete_now && !toobig &&
                    vp->v_count == 1 && !vn_has_cached_data(vp) &&
-                   zp->z_phys->zp_xattr == xattr_obj &&
-                   zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
+                   xattr_obj == xattr_obj_unlinked && ZFS_EXTERNAL_ACL(zp) ==
+                   acl_obj;
                mutex_exit(&vp->v_lock);
        }
 
        if (delete_now) {
-               if (zp->z_phys->zp_xattr) {
-                       error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
-                       ASSERT3U(error, ==, 0);
-                       ASSERT3U(xzp->z_phys->zp_links, ==, 2);
-                       dmu_buf_will_dirty(xzp->z_dbuf, tx);
+               if (xattr_obj_unlinked) {
+                       ASSERT3U(xzp->z_links, ==, 2);
                        mutex_enter(&xzp->z_lock);
                        xzp->z_unlinked = 1;
-                       xzp->z_phys->zp_links = 0;
+                       xzp->z_links = 0;
+                       error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+                           &xzp->z_links, sizeof (xzp->z_links), tx);
+                       ASSERT3U(error,  ==,  0);
                        mutex_exit(&xzp->z_lock);
                        zfs_unlinked_add(xzp, tx);
-                       zp->z_phys->zp_xattr = 0; /* probably unnecessary */
+                       if (zp->z_is_sa)
+                               error = sa_remove(zp->z_sa_hdl,
+                                   SA_ZPL_XATTR(zfsvfs), tx);
+                       else
+                               error = sa_update(zp->z_sa_hdl,
+                                   SA_ZPL_XATTR(zfsvfs), &null_xattr,
+                                   sizeof (uint64_t), tx);
+                       ASSERT3U(error, ==, 0);
                }
                mutex_enter(&zp->z_lock);
                mutex_enter(&vp->v_lock);
@@ -1591,12 +1708,13 @@ out:
 
        zfs_dirent_unlock(dl);
 
-       if (!delete_now) {
+       if (!delete_now)
                VN_RELE(vp);
-       } else if (xzp) {
-               /* this rele is delayed to prevent nesting transactions */
+       if (xzp)
                VN_RELE(ZTOV(xzp));
-       }
+
+       if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+               zil_commit(zilog, UINT64_MAX, 0);
 
        ZFS_EXIT(zfsvfs);
        return (error);
@@ -1638,7 +1756,7 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
        ksid_t          *ksid;
        uid_t           uid;
        gid_t           gid = crgetgid(cr);
-       zfs_acl_ids_t   acl_ids;
+       zfs_acl_ids_t   acl_ids;
        boolean_t       fuid_dirtied;
 
        ASSERT(vap->va_type == VDIR);
@@ -1662,7 +1780,7 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
        ZFS_VERIFY_ZP(dzp);
        zilog = zfsvfs->z_log;
 
-       if (dzp->z_phys->zp_flags & ZFS_XATTR) {
+       if (dzp->z_pflags & ZFS_XATTR) {
                ZFS_EXIT(zfsvfs);
                return (EINVAL);
        }
@@ -1675,37 +1793,43 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
        if (flags & FIGNORECASE)
                zf |= ZCILOOK;
 
-       if (vap->va_mask & AT_XVATTR)
+       if (vap->va_mask & AT_XVATTR) {
                if ((error = secpolicy_xvattr((xvattr_t *)vap,
                    crgetuid(cr), cr, vap->va_type)) != 0) {
                        ZFS_EXIT(zfsvfs);
                        return (error);
                }
+       }
 
+       if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
+           vsecp, &acl_ids)) != 0) {
+               ZFS_EXIT(zfsvfs);
+               return (error);
+       }
        /*
         * First make sure the new directory doesn't exist.
+        *
+        * Existence is checked first to make sure we don't return
+        * EACCES instead of EEXIST which can cause some applications
+        * to fail.
         */
 top:
        *vpp = NULL;
 
        if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
            NULL, NULL)) {
+               zfs_acl_ids_free(&acl_ids);
                ZFS_EXIT(zfsvfs);
                return (error);
        }
 
        if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
+               zfs_acl_ids_free(&acl_ids);
                zfs_dirent_unlock(dl);
                ZFS_EXIT(zfsvfs);
                return (error);
        }
 
-       if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
-           &acl_ids)) != 0) {
-               zfs_dirent_unlock(dl);
-               ZFS_EXIT(zfsvfs);
-               return (error);
-       }
        if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
                zfs_acl_ids_free(&acl_ids);
                zfs_dirent_unlock(dl);
@@ -1722,18 +1846,23 @@ top:
        fuid_dirtied = zfsvfs->z_fuid_dirty;
        if (fuid_dirtied)
                zfs_fuid_txhold(zfsvfs, tx);
-       if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
-               dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
-                   0, SPA_MAXBLOCKSIZE);
+       if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+               dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+                   acl_ids.z_aclp->z_acl_bytes);
+       }
+
+       dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+           ZFS_SA_BASE_ATTR_SIZE);
+
        error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error) {
-               zfs_acl_ids_free(&acl_ids);
                zfs_dirent_unlock(dl);
                if (error == ERESTART) {
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
                        goto top;
                }
+               zfs_acl_ids_free(&acl_ids);
                dmu_tx_abort(tx);
                ZFS_EXIT(zfsvfs);
                return (error);
@@ -1742,10 +1871,11 @@ top:
        /*
         * Create new node.
         */
-       zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
+       zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
        if (fuid_dirtied)
                zfs_fuid_sync(zfsvfs, tx);
+
        /*
         * Now put new name in parent dir.
         */
@@ -1760,10 +1890,14 @@ top:
            acl_ids.z_fuidp, vap);
 
        zfs_acl_ids_free(&acl_ids);
+
        dmu_tx_commit(tx);
 
        zfs_dirent_unlock(dl);
 
+       if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+               zil_commit(zilog, UINT64_MAX, 0);
+
        ZFS_EXIT(zfsvfs);
        return (0);
 }
@@ -1851,8 +1985,10 @@ top:
 
        tx = dmu_tx_create(zfsvfs->z_os);
        dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
-       dmu_tx_hold_bonus(tx, zp->z_id);
+       dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
        dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+       zfs_sa_upgrade_txholds(tx, zp);
+       zfs_sa_upgrade_txholds(tx, dzp);
        error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error) {
                rw_exit(&zp->z_parent_lock);
@@ -1887,6 +2023,9 @@ out:
 
        VN_RELE(vp);
 
+       if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+               zil_commit(zilog, UINT64_MAX, 0);
+
        ZFS_EXIT(zfsvfs);
        return (error);
 }
@@ -1934,6 +2073,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
        zap_attribute_t zap;
        uint_t          bytes_wanted;
        uint64_t        offset; /* must be unsigned; checks for < 1 */
+       uint64_t        parent;
        int             local_eof;
        int             outcount;
        int             error;
@@ -1943,6 +2083,12 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zp);
 
+       if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+           &parent, sizeof (parent))) != 0) {
+               ZFS_EXIT(zfsvfs);
+               return (error);
+       }
+
        /*
         * If we are not given an eof variable,
         * use a local one.
@@ -2030,7 +2176,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
                } else if (offset == 1) {
                        (void) strcpy(zap.za_name, "..");
                        zap.za_normalization_conflict = 0;
-                       objnum = zp->z_phys->zp_parent;
+                       objnum = parent;
                } else if (offset == 2 && zfs_show_ctldir(zp)) {
                        (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
                        zap.za_normalization_conflict = 0;
@@ -2194,10 +2340,12 @@ zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
 
        (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
 
-       ZFS_ENTER(zfsvfs);
-       ZFS_VERIFY_ZP(zp);
-       zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
-       ZFS_EXIT(zfsvfs);
+       if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
+               ZFS_ENTER(zfsvfs);
+               ZFS_VERIFY_ZP(zp);
+               zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
+               ZFS_EXIT(zfsvfs);
+       }
        return (0);
 }
 
@@ -2224,24 +2372,32 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
 {
        znode_t *zp = VTOZ(vp);
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-       znode_phys_t *pzp;
        int     error = 0;
        uint64_t links;
+       uint64_t mtime[2], ctime[2];
        xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
        xoptattr_t *xoap = NULL;
        boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+       sa_bulk_attr_t bulk[2];
+       int count = 0;
 
        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zp);
-       pzp = zp->z_phys;
+
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+
+       if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
+               ZFS_EXIT(zfsvfs);
+               return (error);
+       }
 
        /*
         * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
         * Also, if we are the owner don't bother, since owner should
         * always be allowed to read basic attributes of file.
         */
-       if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) &&
-           (pzp->zp_uid != crgetuid(cr))) {
+       if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && (zp->z_uid != crgetuid(cr))) {
                if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
                    skipaclchk, cr)) {
                        ZFS_EXIT(zfsvfs);
@@ -2256,16 +2412,17 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
 
        mutex_enter(&zp->z_lock);
        vap->va_type = vp->v_type;
-       vap->va_mode = pzp->zp_mode & MODEMASK;
-       zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
+       vap->va_mode = zp->z_mode & MODEMASK;
+       vap->va_uid = zp->z_uid;
+       vap->va_gid = zp->z_gid;
        vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
        vap->va_nodeid = zp->z_id;
        if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
-               links = pzp->zp_links + 1;
+               links = zp->z_links + 1;
        else
-               links = pzp->zp_links;
+               links = zp->z_links;
        vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */
-       vap->va_size = pzp->zp_size;
+       vap->va_size = zp->z_size;
        vap->va_rdev = vp->v_rdev;
        vap->va_seq = zp->z_seq;
 
@@ -2276,109 +2433,97 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
        if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
                if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
                        xoap->xoa_archive =
-                           ((pzp->zp_flags & ZFS_ARCHIVE) != 0);
+                           ((zp->z_pflags & ZFS_ARCHIVE) != 0);
                        XVA_SET_RTN(xvap, XAT_ARCHIVE);
                }
 
                if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
                        xoap->xoa_readonly =
-                           ((pzp->zp_flags & ZFS_READONLY) != 0);
+                           ((zp->z_pflags & ZFS_READONLY) != 0);
                        XVA_SET_RTN(xvap, XAT_READONLY);
                }
 
                if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
                        xoap->xoa_system =
-                           ((pzp->zp_flags & ZFS_SYSTEM) != 0);
+                           ((zp->z_pflags & ZFS_SYSTEM) != 0);
                        XVA_SET_RTN(xvap, XAT_SYSTEM);
                }
 
                if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
                        xoap->xoa_hidden =
-                           ((pzp->zp_flags & ZFS_HIDDEN) != 0);
+                           ((zp->z_pflags & ZFS_HIDDEN) != 0);
                        XVA_SET_RTN(xvap, XAT_HIDDEN);
                }
 
                if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
                        xoap->xoa_nounlink =
-                           ((pzp->zp_flags & ZFS_NOUNLINK) != 0);
+                           ((zp->z_pflags & ZFS_NOUNLINK) != 0);
                        XVA_SET_RTN(xvap, XAT_NOUNLINK);
                }
 
                if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
                        xoap->xoa_immutable =
-                           ((pzp->zp_flags & ZFS_IMMUTABLE) != 0);
+                           ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
                        XVA_SET_RTN(xvap, XAT_IMMUTABLE);
                }
 
                if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
                        xoap->xoa_appendonly =
-                           ((pzp->zp_flags & ZFS_APPENDONLY) != 0);
+                           ((zp->z_pflags & ZFS_APPENDONLY) != 0);
                        XVA_SET_RTN(xvap, XAT_APPENDONLY);
                }
 
                if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
                        xoap->xoa_nodump =
-                           ((pzp->zp_flags & ZFS_NODUMP) != 0);
+                           ((zp->z_pflags & ZFS_NODUMP) != 0);
                        XVA_SET_RTN(xvap, XAT_NODUMP);
                }
 
                if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
                        xoap->xoa_opaque =
-                           ((pzp->zp_flags & ZFS_OPAQUE) != 0);
+                           ((zp->z_pflags & ZFS_OPAQUE) != 0);
                        XVA_SET_RTN(xvap, XAT_OPAQUE);
                }
 
                if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
                        xoap->xoa_av_quarantined =
-                           ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0);
+                           ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
                        XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
                }
 
                if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
                        xoap->xoa_av_modified =
-                           ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0);
+                           ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
                        XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
                }
 
                if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
-                   vp->v_type == VREG &&
-                   (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) {
-                       size_t len;
-                       dmu_object_info_t doi;
-
-                       /*
-                        * Only VREG files have anti-virus scanstamps, so we
-                        * won't conflict with symlinks in the bonus buffer.
-                        */
-                       dmu_object_info_from_db(zp->z_dbuf, &doi);
-                       len = sizeof (xoap->xoa_av_scanstamp) +
-                           sizeof (znode_phys_t);
-                       if (len <= doi.doi_bonus_size) {
-                               /*
-                                * pzp points to the start of the
-                                * znode_phys_t. pzp + 1 points to the
-                                * first byte after the znode_phys_t.
-                                */
-                               (void) memcpy(xoap->xoa_av_scanstamp,
-                                   pzp + 1,
-                                   sizeof (xoap->xoa_av_scanstamp));
-                               XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
-                       }
+                   vp->v_type == VREG) {
+                       zfs_sa_get_scanstamp(zp, xvap);
                }
 
                if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
-                       ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime);
+                       uint64_t times[2];
+
+                       (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
+                           times, sizeof (times));
+                       ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
                        XVA_SET_RTN(xvap, XAT_CREATETIME);
                }
+
+               if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+                       xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
+                       XVA_SET_RTN(xvap, XAT_REPARSE);
+               }
        }
 
-       ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
-       ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
-       ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
+       ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
+       ZFS_TIME_DECODE(&vap->va_mtime, mtime);
+       ZFS_TIME_DECODE(&vap->va_ctime, ctime);
 
        mutex_exit(&zp->z_lock);
 
-       dmu_object_size_from_db(zp->z_dbuf, &vap->va_blksize, &vap->va_nblocks);
+       sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks);
 
        if (zp->z_blksz == 0) {
                /*
@@ -2415,7 +2560,6 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
        caller_context_t *ct)
 {
        znode_t         *zp = VTOZ(vp);
-       znode_phys_t    *pzp;
        zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
        zilog_t         *zilog;
        dmu_tx_t        *tx;
@@ -2426,15 +2570,19 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
        int             trim_mask = 0;
        uint64_t        new_mode;
        uint64_t        new_uid, new_gid;
+       uint64_t        xattr_obj = 0;
+       uint64_t        mtime[2], ctime[2];
        znode_t         *attrzp;
        int             need_policy = FALSE;
-       int             err;
+       int             err, err2;
        zfs_fuid_info_t *fuidp = NULL;
        xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
        xoptattr_t      *xoap;
        zfs_acl_t       *aclp = NULL;
        boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
-       boolean_t fuid_dirtied = B_FALSE;
+       boolean_t       fuid_dirtied = B_FALSE;
+       sa_bulk_attr_t  bulk[7], xattr_bulk[7];
+       int             count = 0, xattr_count = 0;
 
        if (mask == 0)
                return (0);
@@ -2445,7 +2593,6 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zp);
 
-       pzp = zp->z_phys;
        zilog = zfsvfs->z_log;
 
        /*
@@ -2482,14 +2629,14 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
        /*
         * Immutable files can only alter immutable bit and atime
         */
-       if ((pzp->zp_flags & ZFS_IMMUTABLE) &&
+       if ((zp->z_pflags & ZFS_IMMUTABLE) &&
            ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
            ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
                ZFS_EXIT(zfsvfs);
                return (EPERM);
        }
 
-       if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) {
+       if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
                ZFS_EXIT(zfsvfs);
                return (EPERM);
        }
@@ -2546,9 +2693,10 @@ top:
            XVA_ISSET_REQ(xvap, XAT_READONLY) ||
            XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
            XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
-           XVA_ISSET_REQ(xvap, XAT_SYSTEM))))
+           XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
                need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
                    skipaclchk, cr);
+       }
 
        if (mask & (AT_UID|AT_GID)) {
                int     idmask = (mask & (AT_UID|AT_GID));
@@ -2561,7 +2709,7 @@ top:
                 */
 
                if (!(mask & AT_MODE))
-                       vap->va_mode = pzp->zp_mode;
+                       vap->va_mode = zp->z_mode;
 
                /*
                 * Take ownership or chgrp to group we are a member of
@@ -2599,8 +2747,9 @@ top:
        }
 
        mutex_enter(&zp->z_lock);
-       oldva.va_mode = pzp->zp_mode;
-       zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
+       oldva.va_mode = zp->z_mode;
+       oldva.va_uid = zp->z_uid;
+       oldva.va_gid = zp->z_gid;
        if (mask & AT_XVATTR) {
                /*
                 * Update xvattr mask to include only those attributes
@@ -2611,7 +2760,7 @@ top:
                 */
                if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
                        if (xoap->xoa_appendonly !=
-                           ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) {
+                           ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
                                need_policy = TRUE;
                        } else {
                                XVA_CLR_REQ(xvap, XAT_APPENDONLY);
@@ -2621,7 +2770,7 @@ top:
 
                if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
                        if (xoap->xoa_nounlink !=
-                           ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) {
+                           ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
                                need_policy = TRUE;
                        } else {
                                XVA_CLR_REQ(xvap, XAT_NOUNLINK);
@@ -2631,7 +2780,7 @@ top:
 
                if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
                        if (xoap->xoa_immutable !=
-                           ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) {
+                           ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
                                need_policy = TRUE;
                        } else {
                                XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
@@ -2641,7 +2790,7 @@ top:
 
                if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
                        if (xoap->xoa_nodump !=
-                           ((pzp->zp_flags & ZFS_NODUMP) != 0)) {
+                           ((zp->z_pflags & ZFS_NODUMP) != 0)) {
                                need_policy = TRUE;
                        } else {
                                XVA_CLR_REQ(xvap, XAT_NODUMP);
@@ -2651,7 +2800,7 @@ top:
 
                if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
                        if (xoap->xoa_av_modified !=
-                           ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) {
+                           ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
                                need_policy = TRUE;
                        } else {
                                XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
@@ -2663,7 +2812,7 @@ top:
                        if ((vp->v_type != VREG &&
                            xoap->xoa_av_quarantined) ||
                            xoap->xoa_av_quarantined !=
-                           ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)) {
+                           ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
                                need_policy = TRUE;
                        } else {
                                XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
@@ -2671,6 +2820,12 @@ top:
                        }
                }
 
+               if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+                       mutex_exit(&zp->z_lock);
+                       ZFS_EXIT(zfsvfs);
+                       return (EPERM);
+               }
+
                if (need_policy == FALSE &&
                    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
                    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
@@ -2724,79 +2879,84 @@ top:
         */
        mask = vap->va_mask;
 
+       if ((mask & (AT_UID | AT_GID))) {
+               (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj,
+                   sizeof (xattr_obj));
+
+               if (xattr_obj) {
+                       err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
+                       if (err)
+                               goto out2;
+               }
+               if (mask & AT_UID) {
+                       new_uid = zfs_fuid_create(zfsvfs,
+                           (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
+                       if (vap->va_uid != zp->z_uid &&
+                           zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
+                               err = EDQUOT;
+                               goto out2;
+                       }
+               }
+
+               if (mask & AT_GID) {
+                       new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
+                           cr, ZFS_GROUP, &fuidp);
+                       if (new_gid != zp->z_gid &&
+                           zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
+                               err = EDQUOT;
+                               goto out2;
+                       }
+               }
+       }
        tx = dmu_tx_create(zfsvfs->z_os);
-       dmu_tx_hold_bonus(tx, zp->z_id);
 
        if (mask & AT_MODE) {
-               uint64_t pmode = pzp->zp_mode;
-
+               uint64_t pmode = zp->z_mode;
                new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
 
                if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
                        goto out;
-               if (pzp->zp_acl.z_acl_extern_obj) {
-                       /* Are we upgrading ACL from old V0 format to new V1 */
+
+               if (!zp->z_is_sa && ZFS_EXTERNAL_ACL(zp)) {
+                       /*
+                        * Are we upgrading ACL from old V0 format
+                        * to V1 format?
+                        */
                        if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
-                           pzp->zp_acl.z_acl_version ==
+                           ZNODE_ACL_VERSION(zp) ==
                            ZFS_ACL_VERSION_INITIAL) {
                                dmu_tx_hold_free(tx,
-                                   pzp->zp_acl.z_acl_extern_obj, 0,
+                                   ZFS_EXTERNAL_ACL(zp), 0,
                                    DMU_OBJECT_END);
                                dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
                                    0, aclp->z_acl_bytes);
                        } else {
-                               dmu_tx_hold_write(tx,
-                                   pzp->zp_acl.z_acl_extern_obj, 0,
+                               dmu_tx_hold_write(tx, ZFS_EXTERNAL_ACL(zp), 0,
                                    aclp->z_acl_bytes);
                        }
-               } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+               } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
                        dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
                            0, aclp->z_acl_bytes);
                }
+               dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+       } else {
+               if ((mask & AT_XVATTR) &&
+                   XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
+                       dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+               else
+                       dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
        }
 
-       if (mask & (AT_UID | AT_GID)) {
-               if (pzp->zp_xattr) {
-                       err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
-                       if (err)
-                               goto out;
-                       dmu_tx_hold_bonus(tx, attrzp->z_id);
-               }
-               if (mask & AT_UID) {
-                       new_uid = zfs_fuid_create(zfsvfs,
-                           (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
-                       if (new_uid != pzp->zp_uid &&
-                           zfs_usergroup_overquota(zfsvfs, B_FALSE, new_uid)) {
-                               err = EDQUOT;
-                               goto out;
-                       }
-               }
-
-               if (mask & AT_GID) {
-                       new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
-                           cr, ZFS_GROUP, &fuidp);
-                       if (new_gid != pzp->zp_gid &&
-                           zfs_usergroup_overquota(zfsvfs, B_TRUE, new_gid)) {
-                               err = EDQUOT;
-                               goto out;
-                       }
-               }
-               fuid_dirtied = zfsvfs->z_fuid_dirty;
-               if (fuid_dirtied) {
-                       if (zfsvfs->z_fuid_obj == 0) {
-                               dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-                               dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
-                                   FUID_SIZE_ESTIMATE(zfsvfs));
-                               dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
-                                   FALSE, NULL);
-                       } else {
-                               dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
-                               dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
-                                   FUID_SIZE_ESTIMATE(zfsvfs));
-                       }
-               }
+       if (attrzp) {
+               dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
        }
 
+       fuid_dirtied = zfsvfs->z_fuid_dirty;
+       if (fuid_dirtied)
+               zfs_fuid_txhold(zfsvfs, tx);
+
+       zfs_sa_upgrade_txholds(tx, zp);
+
        err = dmu_tx_assign(tx, TXG_NOWAIT);
        if (err) {
                if (err == ERESTART)
@@ -2804,8 +2964,7 @@ top:
                goto out;
        }
 
-       dmu_buf_will_dirty(zp->z_dbuf, tx);
-
+       count = 0;
        /*
         * Set each attribute requested.
         * We group settings according to the locks they need to acquire.
@@ -2816,9 +2975,62 @@ top:
 
        mutex_enter(&zp->z_lock);
 
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+           &zp->z_pflags, sizeof (zp->z_pflags));
+
+       if (attrzp) {
+               mutex_enter(&attrzp->z_lock);
+               SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+                   SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
+                   sizeof (attrzp->z_pflags));
+       }
+
+       if (mask & (AT_UID|AT_GID)) {
+
+               if (mask & AT_UID) {
+                       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+                           &new_uid, sizeof (new_uid));
+                       zp->z_uid = zfs_fuid_map_id(zfsvfs, new_uid,
+                           cr, ZFS_OWNER);
+                       if (attrzp) {
+                               SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+                                   SA_ZPL_UID(zfsvfs), NULL, &new_uid,
+                                   sizeof (new_uid));
+                               attrzp->z_uid = zp->z_uid;
+                       }
+               }
+
+               if (mask & AT_GID) {
+                       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
+                           NULL, &new_gid, sizeof (new_gid));
+                       zp->z_gid = zfs_fuid_map_id(zfsvfs, new_gid, cr,
+                           ZFS_GROUP);
+                       if (attrzp) {
+                               SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+                                   SA_ZPL_GID(zfsvfs), NULL, &new_gid,
+                                   sizeof (new_gid));
+                               attrzp->z_gid = zp->z_gid;
+                       }
+               }
+               if (!(mask & AT_MODE)) {
+                       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
+                           NULL, &new_mode, sizeof (new_mode));
+                       new_mode = zp->z_mode;
+               }
+               err = zfs_acl_chown_setattr(zp);
+               ASSERT(err == 0);
+               if (attrzp) {
+                       err = zfs_acl_chown_setattr(attrzp);
+                       ASSERT(err == 0);
+               }
+       }
+
        if (mask & AT_MODE) {
                mutex_enter(&zp->z_acl_lock);
-               zp->z_phys->zp_mode = new_mode;
+               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+                   &new_mode, sizeof (new_mode));
+               zp->z_mode = new_mode;
+               ASSERT3U((uintptr_t)aclp, !=, NULL);
                err = zfs_aclset_common(zp, aclp, cr, tx);
                ASSERT3U(err, ==, 0);
                zp->z_acl_cached = aclp;
@@ -2827,34 +3039,41 @@ top:
        }
 
        if (attrzp)
-               mutex_enter(&attrzp->z_lock);
+               mutex_exit(&attrzp->z_lock);
 
-       if (mask & AT_UID) {
-               pzp->zp_uid = new_uid;
-               if (attrzp)
-                       attrzp->z_phys->zp_uid = new_uid;
+       if (mask & AT_ATIME) {
+               ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
+               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+                   &zp->z_atime, sizeof (zp->z_atime));
        }
 
-       if (mask & AT_GID) {
-               pzp->zp_gid = new_gid;
-               if (attrzp)
-                       attrzp->z_phys->zp_gid = new_gid;
+       if (mask & AT_MTIME) {
+               ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+                   mtime, sizeof (mtime));
        }
 
-       if (attrzp)
-               mutex_exit(&attrzp->z_lock);
-
-       if (mask & AT_ATIME)
-               ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
-
-       if (mask & AT_MTIME)
-               ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
-
        /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
-       if (mask & AT_SIZE)
-               zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
-       else if (mask != 0)
-               zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+       if (mask & AT_SIZE && !(mask & AT_MTIME)) {
+               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+                   NULL, mtime, sizeof (mtime));
+               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+                   &ctime, sizeof (ctime));
+               zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
+                   B_TRUE);
+       } else if (mask != 0) {
+               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+                   &ctime, sizeof (ctime));
+               zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
+                   B_TRUE);
+               if (attrzp) {
+                       SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+                           SA_ZPL_CTIME(zfsvfs), NULL,
+                           &ctime, sizeof (ctime));
+                       zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
+                           mtime, ctime, B_TRUE);
+               }
+       }
        /*
         * Do this after setting timestamps to prevent timestamp
         * update from toggling bit
@@ -2886,20 +3105,10 @@ top:
                        XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
                }
 
-               if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
-                       size_t len;
-                       dmu_object_info_t doi;
-
+               if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
                        ASSERT(vp->v_type == VREG);
 
-                       /* Grow the bonus buffer if necessary. */
-                       dmu_object_info_from_db(zp->z_dbuf, &doi);
-                       len = sizeof (xoap->xoa_av_scanstamp) +
-                           sizeof (znode_phys_t);
-                       if (len > doi.doi_bonus_size)
-                               VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0);
-               }
-               zfs_xvattr_set(zp, xvap);
+               zfs_xvattr_set(zp, xvap, tx);
        }
 
        if (fuid_dirtied)
@@ -2911,9 +3120,14 @@ top:
        mutex_exit(&zp->z_lock);
 
 out:
+       if (err == 0 && attrzp) {
+               err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
+                   xattr_count, tx);
+               ASSERT(err2 == 0);
+       }
+
        if (attrzp)
                VN_RELE(ZTOV(attrzp));
-
        if (aclp)
                zfs_acl_free(aclp);
 
@@ -2922,13 +3136,19 @@ out:
                fuidp = NULL;
        }
 
-       if (err)
+       if (err) {
                dmu_tx_abort(tx);
-       else
+               if (err == ERESTART)
+                       goto top;
+       } else {
+               err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
                dmu_tx_commit(tx);
+       }
+
 
-       if (err == ERESTART)
-               goto top;
+out2:
+       if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+               zil_commit(zilog, UINT64_MAX, 0);
 
        ZFS_EXIT(zfsvfs);
        return (err);
@@ -2969,7 +3189,7 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
        zfs_zlock_t     *zl;
        znode_t         *zp = tdzp;
        uint64_t        rootid = zp->z_zfsvfs->z_root;
-       uint64_t        *oidp = &zp->z_id;
+       uint64_t        oidp = zp->z_id;
        krwlock_t       *rwlp = &szp->z_parent_lock;
        krw_t           rw = RW_WRITER;
 
@@ -2991,7 +3211,7 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
                                zfs_rename_unlock(&zl);
                                *zlpp = NULL;
                                zp = tdzp;
-                               oidp = &zp->z_id;
+                               oidp = zp->z_id;
                                rwlp = &szp->z_parent_lock;
                                rw = RW_WRITER;
                                continue;
@@ -3009,19 +3229,20 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
                zl->zl_next = *zlpp;
                *zlpp = zl;
 
-               if (*oidp == szp->z_id)         /* We're a descendant of szp */
+               if (oidp == szp->z_id)          /* We're a descendant of szp */
                        return (EINVAL);
 
-               if (*oidp == rootid)            /* We've hit the top */
+               if (oidp == rootid)             /* We've hit the top */
                        return (0);
 
                if (rw == RW_READER) {          /* i.e. not the first pass */
-                       int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
+                       int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
                        if (error)
                                return (error);
                        zl->zl_znode = zp;
                }
-               oidp = &zp->z_phys->zp_parent;
+               (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
+                   &oidp, sizeof (oidp));
                rwlp = &zp->z_parent_lock;
                rw = RW_READER;
 
@@ -3075,7 +3296,7 @@ zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
        if (VOP_REALVP(tdvp, &realvp, ct) == 0)
                tdvp = realvp;
 
-       if (tdvp->v_vfsp != sdvp->v_vfsp) {
+       if (tdvp->v_vfsp != sdvp->v_vfsp || zfsctl_is_node(tdvp)) {
                ZFS_EXIT(zfsvfs);
                return (EXDEV);
        }
@@ -3101,8 +3322,7 @@ top:
         * by renaming a linked file into/outof an attribute directory.
         * See the comment in zfs_link() for why this is considered bad.
         */
-       if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
-           (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
+       if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
                ZFS_EXIT(zfsvfs);
                return (EINVAL);
        }
@@ -3168,6 +3388,15 @@ top:
                }
        }
 
+       /*
+        * If the source and destination directories are the same, we should
+        * grab the z_name_lock of that directory only once.
+        */
+       if (sdzp == tdzp) {
+               zflg |= ZHAVELOCK;
+               rw_enter(&sdzp->z_name_lock, RW_READER);
+       }
+
        if (cmp < 0) {
                serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
                    ZEXISTS | zflg, NULL, NULL);
@@ -3190,6 +3419,10 @@ top:
                        if (tzp)
                                VN_RELE(ZTOV(tzp));
                }
+
+               if (sdzp == tdzp)
+                       rw_exit(&sdzp->z_name_lock);
+
                if (strcmp(snm, "..") == 0)
                        serr = EINVAL;
                ZFS_EXIT(zfsvfs);
@@ -3198,6 +3431,10 @@ top:
        if (terr) {
                zfs_dirent_unlock(sdl);
                VN_RELE(ZTOV(szp));
+
+               if (sdzp == tdzp)
+                       rw_exit(&sdzp->z_name_lock);
+
                if (strcmp(tnm, "..") == 0)
                        terr = EINVAL;
                ZFS_EXIT(zfsvfs);
@@ -3265,14 +3502,20 @@ top:
        }
 
        tx = dmu_tx_create(zfsvfs->z_os);
-       dmu_tx_hold_bonus(tx, szp->z_id);       /* nlink changes */
-       dmu_tx_hold_bonus(tx, sdzp->z_id);      /* nlink changes */
+       dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+       dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
        dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
        dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
-       if (sdzp != tdzp)
-               dmu_tx_hold_bonus(tx, tdzp->z_id);      /* nlink changes */
-       if (tzp)
-               dmu_tx_hold_bonus(tx, tzp->z_id);       /* parent changes */
+       if (sdzp != tdzp) {
+               dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
+               zfs_sa_upgrade_txholds(tx, tdzp);
+       }
+       if (tzp) {
+               dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
+               zfs_sa_upgrade_txholds(tx, tzp);
+       }
+
+       zfs_sa_upgrade_txholds(tx, szp);
        dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
        error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error) {
@@ -3280,6 +3523,10 @@ top:
                        zfs_rename_unlock(&zl);
                zfs_dirent_unlock(sdl);
                zfs_dirent_unlock(tdl);
+
+               if (sdzp == tdzp)
+                       rw_exit(&sdzp->z_name_lock);
+
                VN_RELE(ZTOV(szp));
                if (tzp)
                        VN_RELE(ZTOV(tzp));
@@ -3299,17 +3546,40 @@ top:
        if (error == 0) {
                error = zfs_link_create(tdl, szp, tx, ZRENAMING);
                if (error == 0) {
-                       szp->z_phys->zp_flags |= ZFS_AV_MODIFIED;
+                       szp->z_pflags |= ZFS_AV_MODIFIED;
 
-                       error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
-                       ASSERT(error == 0);
+                       error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+                           (void *)&szp->z_pflags, sizeof (uint64_t), tx);
+                       ASSERT3U(error, ==, 0);
 
-                       zfs_log_rename(zilog, tx,
-                           TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0),
-                           sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
+                       error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
+                       if (error == 0) {
+                               zfs_log_rename(zilog, tx, TX_RENAME |
+                                   (flags & FIGNORECASE ? TX_CI : 0),
+                                   sdzp, sdl->dl_name, tdzp, tdl->dl_name,
+                                   szp);
 
-                       /* Update path information for the target vnode */
-                       vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm));
+                               /*
+                                * Update path information for the target vnode
+                                */
+                               vn_renamepath(tdvp, ZTOV(szp), tnm,
+                                   strlen(tnm));
+                       } else {
+                               /*
+                                * At this point, we have successfully created
+                                * the target name, but have failed to remove
+                                * the source name.  Since the create was done
+                                * with the ZRENAMING flag, there are
+                                * complications; for one, the link count is
+                                * wrong.  The easiest way to deal with this
+                                * is to remove the newly created target, and
+                                * return the original error.  This must
+                                * succeed; fortunately, it is very unlikely to
+                                * fail, since we just created it.
+                                */
+                               VERIFY3U(zfs_link_destroy(tdl, szp, tx,
+                                   ZRENAMING, NULL), ==, 0);
+                       }
                }
        }
 
@@ -3321,10 +3591,17 @@ out:
        zfs_dirent_unlock(sdl);
        zfs_dirent_unlock(tdl);
 
+       if (sdzp == tdzp)
+               rw_exit(&sdzp->z_name_lock);
+
+
        VN_RELE(ZTOV(szp));
        if (tzp)
                VN_RELE(ZTOV(tzp));
 
+       if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+               zil_commit(zilog, UINT64_MAX, 0);
+
        ZFS_EXIT(zfsvfs);
        return (error);
 }
@@ -3356,11 +3633,12 @@ zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr,
        dmu_tx_t        *tx;
        zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
        zilog_t         *zilog;
-       int             len = strlen(link);
+       uint64_t        len = strlen(link);
        int             error;
        int             zflg = ZNEW;
        zfs_acl_ids_t   acl_ids;
        boolean_t       fuid_dirtied;
+       uint64_t        txtype = TX_SYMLINK;
 
        ASSERT(vap->va_type == VLNK);
 
@@ -3375,27 +3653,35 @@ zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr,
        }
        if (flags & FIGNORECASE)
                zflg |= ZCILOOK;
-top:
-       if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
-               ZFS_EXIT(zfsvfs);
-               return (error);
-       }
 
        if (len > MAXPATHLEN) {
                ZFS_EXIT(zfsvfs);
                return (ENAMETOOLONG);
        }
 
+       if ((error = zfs_acl_ids_create(dzp, 0,
+           vap, cr, NULL, &acl_ids)) != 0) {
+               ZFS_EXIT(zfsvfs);
+               return (error);
+       }
+top:
        /*
         * Attempt to lock directory; fail if entry already exists.
         */
        error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
        if (error) {
+               zfs_acl_ids_free(&acl_ids);
+               ZFS_EXIT(zfsvfs);
+               return (error);
+       }
+
+       if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
+               zfs_acl_ids_free(&acl_ids);
+               zfs_dirent_unlock(dl);
                ZFS_EXIT(zfsvfs);
                return (error);
        }
 
-       VERIFY(0 == zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids));
        if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
                zfs_acl_ids_free(&acl_ids);
                zfs_dirent_unlock(dl);
@@ -3405,70 +3691,56 @@ top:
        tx = dmu_tx_create(zfsvfs->z_os);
        fuid_dirtied = zfsvfs->z_fuid_dirty;
        dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
-       dmu_tx_hold_bonus(tx, dzp->z_id);
        dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
-       if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
-               dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
+       dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+           ZFS_SA_BASE_ATTR_SIZE + len);
+       dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+       if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+               dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+                   acl_ids.z_aclp->z_acl_bytes);
+       }
        if (fuid_dirtied)
                zfs_fuid_txhold(zfsvfs, tx);
        error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error) {
-               zfs_acl_ids_free(&acl_ids);
                zfs_dirent_unlock(dl);
                if (error == ERESTART) {
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
                        goto top;
                }
+               zfs_acl_ids_free(&acl_ids);
                dmu_tx_abort(tx);
                ZFS_EXIT(zfsvfs);
                return (error);
        }
 
-       dmu_buf_will_dirty(dzp->z_dbuf, tx);
-
        /*
         * Create a new object for the symlink.
-        * Put the link content into bonus buffer if it will fit;
-        * otherwise, store it just like any other file data.
+        * for version 4 ZPL datsets the symlink will be an SA attribute
         */
-       if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
-               zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, &acl_ids);
-               if (len != 0)
-                       bcopy(link, zp->z_phys + 1, len);
-       } else {
-               dmu_buf_t *dbp;
-
-               zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
+       zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
-               if (fuid_dirtied)
-                       zfs_fuid_sync(zfsvfs, tx);
-               /*
-                * Nothing can access the znode yet so no locking needed
-                * for growing the znode's blocksize.
-                */
-               zfs_grow_blocksize(zp, len, tx);
-
-               VERIFY(0 == dmu_buf_hold(zfsvfs->z_os,
-                   zp->z_id, 0, FTAG, &dbp));
-               dmu_buf_will_dirty(dbp, tx);
+       if (fuid_dirtied)
+               zfs_fuid_sync(zfsvfs, tx);
 
-               ASSERT3U(len, <=, dbp->db_size);
-               bcopy(link, dbp->db_data, len);
-               dmu_buf_rele(dbp, FTAG);
-       }
-       zp->z_phys->zp_size = len;
+       if (zp->z_is_sa)
+               error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
+                   link, len, tx);
+       else
+               zfs_sa_symlink(zp, link, len, tx);
 
+       zp->z_size = len;
+       (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+           &zp->z_size, sizeof (zp->z_size), tx);
        /*
         * Insert the new object into the directory.
         */
        (void) zfs_link_create(dl, zp, tx, ZNEW);
-       if (error == 0) {
-               uint64_t txtype = TX_SYMLINK;
-               if (flags & FIGNORECASE)
-                       txtype |= TX_CI;
-               zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
-       }
+
+       if (flags & FIGNORECASE)
+               txtype |= TX_CI;
+       zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
 
        zfs_acl_ids_free(&acl_ids);
 
@@ -3478,6 +3750,9 @@ top:
 
        VN_RELE(ZTOV(zp));
 
+       if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+               zil_commit(zilog, UINT64_MAX, 0);
+
        ZFS_EXIT(zfsvfs);
        return (error);
 }
@@ -3505,29 +3780,19 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
 {
        znode_t         *zp = VTOZ(vp);
        zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
-       size_t          bufsz;
        int             error;
 
        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zp);
 
-       bufsz = (size_t)zp->z_phys->zp_size;
-       if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
-               error = uiomove(zp->z_phys + 1,
-                   MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
-       } else {
-               dmu_buf_t *dbp;
-               error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
-               if (error) {
-                       ZFS_EXIT(zfsvfs);
-                       return (error);
-               }
-               error = uiomove(dbp->db_data,
-                   MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
-               dmu_buf_rele(dbp, FTAG);
-       }
+       if (zp->z_is_sa)
+               error = sa_lookup_uio(zp->z_sa_hdl,
+                   SA_ZPL_SYMLINK(zfsvfs), uio);
+       else
+               error = zfs_sa_readlink(zp, uio);
 
        ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+
        ZFS_EXIT(zfsvfs);
        return (error);
 }
@@ -3562,7 +3827,7 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
        vnode_t         *realvp;
        int             error;
        int             zf = ZNEW;
-       uid_t           owner;
+       uint64_t        parent;
 
        ASSERT(tdvp->v_type == VDIR);
 
@@ -3573,13 +3838,35 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
        if (VOP_REALVP(svp, &realvp, ct) == 0)
                svp = realvp;
 
-       if (svp->v_vfsp != tdvp->v_vfsp) {
+       /*
+        * POSIX dictates that we return EPERM here.
+        * Better choices include ENOTSUP or EISDIR.
+        */
+       if (svp->v_type == VDIR) {
+               ZFS_EXIT(zfsvfs);
+               return (EPERM);
+       }
+
+       if (svp->v_vfsp != tdvp->v_vfsp || zfsctl_is_node(svp)) {
                ZFS_EXIT(zfsvfs);
                return (EXDEV);
        }
+
        szp = VTOZ(svp);
        ZFS_VERIFY_ZP(szp);
 
+       /* Prevent links to .zfs/shares files */
+
+       if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+           &parent, sizeof (uint64_t))) != 0) {
+               ZFS_EXIT(zfsvfs);
+               return (error);
+       }
+       if (parent == zfsvfs->z_shares_dir) {
+               ZFS_EXIT(zfsvfs);
+               return (EPERM);
+       }
+
        if (zfsvfs->z_utf8 && u8_validate(name,
            strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
                ZFS_EXIT(zfsvfs);
@@ -3588,30 +3875,19 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
        if (flags & FIGNORECASE)
                zf |= ZCILOOK;
 
-top:
        /*
         * We do not support links between attributes and non-attributes
         * because of the potential security risk of creating links
         * into "normal" file space in order to circumvent restrictions
         * imposed in attribute space.
         */
-       if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
-           (dzp->z_phys->zp_flags & ZFS_XATTR)) {
+       if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
                ZFS_EXIT(zfsvfs);
                return (EINVAL);
        }
 
-       /*
-        * POSIX dictates that we return EPERM here.
-        * Better choices include ENOTSUP or EISDIR.
-        */
-       if (svp->v_type == VDIR) {
-               ZFS_EXIT(zfsvfs);
-               return (EPERM);
-       }
 
-       owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER);
-       if (owner != crgetuid(cr) &&
+       if (szp->z_uid != crgetuid(cr) &&
            secpolicy_basic_link(cr) != 0) {
                ZFS_EXIT(zfsvfs);
                return (EPERM);
@@ -3622,6 +3898,7 @@ top:
                return (error);
        }
 
+top:
        /*
         * Attempt to lock directory; fail if entry already exists.
         */
@@ -3632,8 +3909,10 @@ top:
        }
 
        tx = dmu_tx_create(zfsvfs->z_os);
-       dmu_tx_hold_bonus(tx, szp->z_id);
+       dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
        dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+       zfs_sa_upgrade_txholds(tx, szp);
+       zfs_sa_upgrade_txholds(tx, dzp);
        error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error) {
                zfs_dirent_unlock(dl);
@@ -3664,6 +3943,9 @@ top:
                vnevent_link(svp, ct);
        }
 
+       if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+               zil_commit(zilog, UINT64_MAX, 0);
+
        ZFS_EXIT(zfsvfs);
        return (error);
 }
@@ -3709,10 +3991,8 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
        dmu_tx_t        *tx;
        u_offset_t      off, koff;
        size_t          len, klen;
-       uint64_t        filesz;
        int             err;
 
-       filesz = zp->z_phys->zp_size;
        off = pp->p_offset;
        len = PAGESIZE;
        /*
@@ -3720,12 +4000,12 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
         * multiple pages so that we write a full block (thus avoiding
         * a read-modify-write).
         */
-       if (off < filesz && zp->z_blksz > PAGESIZE) {
+       if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
                klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
                koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
-               ASSERT(koff <= filesz);
-               if (koff + klen > filesz)
-                       klen = P2ROUNDUP(filesz - koff, (uint64_t)PAGESIZE);
+               ASSERT(koff <= zp->z_size);
+               if (koff + klen > zp->z_size)
+                       klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
                pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
        }
        ASSERT3U(btop(len), ==, btopr(len));
@@ -3733,30 +4013,32 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
        /*
         * Can't push pages past end-of-file.
         */
-       if (off >= filesz) {
+       if (off >= zp->z_size) {
                /* ignore all pages */
                err = 0;
                goto out;
-       } else if (off + len > filesz) {
-               int npages = btopr(filesz - off);
+       } else if (off + len > zp->z_size) {
+               int npages = btopr(zp->z_size - off);
                page_t *trunc;
 
                page_list_break(&pp, &trunc, npages);
                /* ignore pages past end of file */
                if (trunc)
                        pvn_write_done(trunc, flags);
-               len = filesz - off;
+               len = zp->z_size - off;
        }
 
-       if (zfs_usergroup_overquota(zfsvfs, B_FALSE, zp->z_phys->zp_uid) ||
-           zfs_usergroup_overquota(zfsvfs, B_TRUE, zp->z_phys->zp_gid)) {
+       if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
+           zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
                err = EDQUOT;
                goto out;
        }
 top:
        tx = dmu_tx_create(zfsvfs->z_os);
        dmu_tx_hold_write(tx, zp->z_id, off, len);
-       dmu_tx_hold_bonus(tx, zp->z_id);
+
+       dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+       zfs_sa_upgrade_txholds(tx, zp);
        err = dmu_tx_assign(tx, TXG_NOWAIT);
        if (err != 0) {
                if (err == ERESTART) {
@@ -3778,7 +4060,18 @@ top:
        }
 
        if (err == 0) {
-               zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+               uint64_t mtime[2], ctime[2];
+               sa_bulk_attr_t bulk[3];
+               int count = 0;
+
+               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+                   &mtime, 16);
+               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+                   &ctime, 16);
+               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+                   &zp->z_pflags, 8);
+               zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
+                   B_TRUE);
                zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
        }
        dmu_tx_commit(tx);
@@ -3854,14 +4147,14 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
        }
        rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
 
-       if (off > zp->z_phys->zp_size) {
+       if (off > zp->z_size) {
                /* past end of file */
                zfs_range_unlock(rl);
                ZFS_EXIT(zfsvfs);
                return (0);
        }
 
-       len = MIN(io_len, P2ROUNDUP(zp->z_phys->zp_size, PAGESIZE) - io_off);
+       len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
 
        for (off = io_off; io_off < off + len; io_off += io_len) {
                if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
@@ -3887,7 +4180,7 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
        }
 out:
        zfs_range_unlock(rl);
-       if ((flags & B_ASYNC) == 0)
+       if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
                zil_commit(zfsvfs->z_log, UINT64_MAX, zp->z_id);
        ZFS_EXIT(zfsvfs);
        return (error);
@@ -3902,7 +4195,7 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
        int error;
 
        rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
-       if (zp->z_dbuf == NULL) {
+       if (zp->z_sa_hdl == NULL) {
                /*
                 * The fs has been unmounted, or we did a
                 * suspend/resume and this file no longer exists.
@@ -3913,7 +4206,10 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
                }
 
                mutex_enter(&zp->z_lock);
-               vp->v_count = 0; /* count arrives as 1 */
+               mutex_enter(&vp->v_lock);
+               ASSERT(vp->v_count == 1);
+               vp->v_count = 0;
+               mutex_exit(&vp->v_lock);
                mutex_exit(&zp->z_lock);
                rw_exit(&zfsvfs->z_teardown_inactive_lock);
                zfs_znode_free(zp);
@@ -3932,13 +4228,15 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
        if (zp->z_atime_dirty && zp->z_unlinked == 0) {
                dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
 
-               dmu_tx_hold_bonus(tx, zp->z_id);
+               dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+               zfs_sa_upgrade_txholds(tx, zp);
                error = dmu_tx_assign(tx, TXG_WAIT);
                if (error) {
                        dmu_tx_abort(tx);
                } else {
-                       dmu_buf_will_dirty(zp->z_dbuf, tx);
                        mutex_enter(&zp->z_lock);
+                       (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
+                           (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
                        zp->z_atime_dirty = 0;
                        mutex_exit(&zp->z_lock);
                        dmu_tx_commit(tx);
@@ -3980,7 +4278,6 @@ zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
 {
        znode_t *zp = VTOZ(vp);
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-       int error;
 
        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zp);
@@ -3991,13 +4288,12 @@ zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
         * return an error, but we don't worry about races between this
         * function and zfs_map().
         */
-       if (zp->z_mapcnt > 0 && MANDMODE((mode_t)zp->z_phys->zp_mode)) {
+       if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
                ZFS_EXIT(zfsvfs);
                return (EAGAIN);
        }
-       error = fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
        ZFS_EXIT(zfsvfs);
-       return (error);
+       return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
 }
 
 /*
@@ -4205,15 +4501,14 @@ zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zp);
 
-       if ((prot & PROT_WRITE) &&
-           (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_READONLY |
-           ZFS_APPENDONLY))) {
+       if ((prot & PROT_WRITE) && (zp->z_pflags &
+           (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
                ZFS_EXIT(zfsvfs);
                return (EPERM);
        }
 
        if ((prot & (PROT_READ | PROT_EXEC)) &&
-           (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED)) {
+           (zp->z_pflags & ZFS_AV_QUARANTINED)) {
                ZFS_EXIT(zfsvfs);
                return (EACCES);
        }
@@ -4236,7 +4531,7 @@ zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
        /*
         * If file is locked, disallow mapping.
         */
-       if (MANDMODE((mode_t)zp->z_phys->zp_mode) && vn_has_flocks(vp)) {
+       if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
                ZFS_EXIT(zfsvfs);
                return (EAGAIN);
        }
@@ -4382,13 +4677,21 @@ zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
        znode_t         *zp = VTOZ(vp);
        zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
        uint32_t        gen;
+       uint64_t        gen64;
        uint64_t        object = zp->z_id;
        zfid_short_t    *zfid;
-       int             size, i;
+       int             size, i, error;
 
        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zp);
-       gen = (uint32_t)zp->z_gen;
+
+       if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
+           &gen64, sizeof (uint64_t))) != 0) {
+               ZFS_EXIT(zfsvfs);
+               return (error);
+       }
+
+       gen = (uint32_t)gen64;
 
        size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
        if (fidp->fid_len < size) {
@@ -4488,6 +4791,11 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
                *valp = (ulong_t)SPA_MINBLOCKSIZE;
                return (0);
 
+       case _PC_TIMESTAMP_RESOLUTION:
+               /* nanosecond timestamp resolution */
+               *valp = 1L;
+               return (0);
+
        default:
                return (fs_pathconf(vp, cmd, valp, cr, ct));
        }
@@ -4520,15 +4828,179 @@ zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
        int error;
        boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+       zilog_t *zilog = zfsvfs->z_log;
 
        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zp);
+
        error = zfs_setacl(zp, vsecp, skipaclchk, cr);
+
+       if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+               zil_commit(zilog, UINT64_MAX, 0);
+
        ZFS_EXIT(zfsvfs);
        return (error);
 }
 
 /*
+ * Tunable, both must be a power of 2.
+ *
+ * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf
+ * zcr_blksz_max: if set to less than the file block size, allow loaning out of
+ *                an arcbuf for a partial block read
+ */
+int zcr_blksz_min = (1 << 10); /* 1K */
+int zcr_blksz_max = (1 << 17); /* 128K */
+
+/*ARGSUSED*/
+static int
+zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
+    caller_context_t *ct)
+{
+       znode_t *zp = VTOZ(vp);
+       zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+       int max_blksz = zfsvfs->z_max_blksz;
+       uio_t *uio = &xuio->xu_uio;
+       ssize_t size = uio->uio_resid;
+       offset_t offset = uio->uio_loffset;
+       int blksz;
+       int fullblk, i;
+       arc_buf_t *abuf;
+       ssize_t maxsize;
+       int preamble, postamble;
+
+       if (xuio->xu_type != UIOTYPE_ZEROCOPY)
+               return (EINVAL);
+
+       ZFS_ENTER(zfsvfs);
+       ZFS_VERIFY_ZP(zp);
+       switch (ioflag) {
+       case UIO_WRITE:
+               /*
+                * Loan out an arc_buf for write if write size is bigger than
+                * max_blksz, and the file's block size is also max_blksz.
+                */
+               blksz = max_blksz;
+               if (size < blksz || zp->z_blksz != blksz) {
+                       ZFS_EXIT(zfsvfs);
+                       return (EINVAL);
+               }
+               /*
+                * Caller requests buffers for write before knowing where the
+                * write offset might be (e.g. NFS TCP write).
+                */
+               if (offset == -1) {
+                       preamble = 0;
+               } else {
+                       preamble = P2PHASE(offset, blksz);
+                       if (preamble) {
+                               preamble = blksz - preamble;
+                               size -= preamble;
+                       }
+               }
+
+               postamble = P2PHASE(size, blksz);
+               size -= postamble;
+
+               fullblk = size / blksz;
+               (void) dmu_xuio_init(xuio,
+                   (preamble != 0) + fullblk + (postamble != 0));
+               DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
+                   int, postamble, int,
+                   (preamble != 0) + fullblk + (postamble != 0));
+
+               /*
+                * Have to fix iov base/len for partial buffers.  They
+                * currently represent full arc_buf's.
+                */
+               if (preamble) {
+                       /* data begins in the middle of the arc_buf */
+                       abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+                           blksz);
+                       ASSERT(abuf);
+                       (void) dmu_xuio_add(xuio, abuf,
+                           blksz - preamble, preamble);
+               }
+
+               for (i = 0; i < fullblk; i++) {
+                       abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+                           blksz);
+                       ASSERT(abuf);
+                       (void) dmu_xuio_add(xuio, abuf, 0, blksz);
+               }
+
+               if (postamble) {
+                       /* data ends in the middle of the arc_buf */
+                       abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+                           blksz);
+                       ASSERT(abuf);
+                       (void) dmu_xuio_add(xuio, abuf, 0, postamble);
+               }
+               break;
+       case UIO_READ:
+               /*
+                * Loan out an arc_buf for read if the read size is larger than
+                * the current file block size.  Block alignment is not
+                * considered.  Partial arc_buf will be loaned out for read.
+                */
+               blksz = zp->z_blksz;
+               if (blksz < zcr_blksz_min)
+                       blksz = zcr_blksz_min;
+               if (blksz > zcr_blksz_max)
+                       blksz = zcr_blksz_max;
+               /* avoid potential complexity of dealing with it */
+               if (blksz > max_blksz) {
+                       ZFS_EXIT(zfsvfs);
+                       return (EINVAL);
+               }
+
+               maxsize = zp->z_size - uio->uio_loffset;
+               if (size > maxsize)
+                       size = maxsize;
+
+               if (size < blksz || vn_has_cached_data(vp)) {
+                       ZFS_EXIT(zfsvfs);
+                       return (EINVAL);
+               }
+               break;
+       default:
+               ZFS_EXIT(zfsvfs);
+               return (EINVAL);
+       }
+
+       uio->uio_extflg = UIO_XUIO;
+       XUIO_XUZC_RW(xuio) = ioflag;
+       ZFS_EXIT(zfsvfs);
+       return (0);
+}
+
+/*ARGSUSED*/
+static int
+zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
+{
+       int i;
+       arc_buf_t *abuf;
+       int ioflag = XUIO_XUZC_RW(xuio);
+
+       ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
+
+       i = dmu_xuio_cnt(xuio);
+       while (i-- > 0) {
+               abuf = dmu_xuio_arcbuf(xuio, i);
+               /*
+                * if abuf == NULL, it must be a write buffer
+                * that has been returned in zfs_write().
+                */
+               if (abuf)
+                       dmu_return_arcbuf(abuf);
+               ASSERT(abuf || ioflag == UIO_WRITE);
+       }
+
+       dmu_xuio_fini(xuio);
+       return (0);
+}
+
+/*
  * Predeclare these here so that the compiler assumes that
  * this is an "old style" function declaration that does
  * not include arguments => we won't get type mismatch errors
@@ -4611,6 +5083,8 @@ const fs_operation_def_t zfs_fvnodeops_template[] = {
        VOPNAME_GETSECATTR,     { .vop_getsecattr = zfs_getsecattr },
        VOPNAME_SETSECATTR,     { .vop_setsecattr = zfs_setsecattr },
        VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
+       VOPNAME_REQZCBUF,       { .vop_reqzcbuf = zfs_reqzcbuf },
+       VOPNAME_RETZCBUF,       { .vop_retzcbuf = zfs_retzcbuf },
        NULL,                   NULL
 };
 
index f99e72f..24bd3dd 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
@@ -52,6 +51,7 @@
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_rlock.h>
 #include <sys/zfs_fuid.h>
+#include <sys/dnode.h>
 #include <sys/fs/zfs.h>
 #include <sys/kidmap.h>
 #endif /* _KERNEL */
 #include <sys/stat.h>
 #include <sys/zap.h>
 #include <sys/zfs_znode.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
 
 #include "zfs_prop.h"
+#include "zfs_comutil.h"
 
 /*
  * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
@@ -131,7 +134,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
        avl_create(&zp->z_range_avl, zfs_range_compare,
            sizeof (rl_t), offsetof(rl_t, r_node));
 
-       zp->z_dbuf = NULL;
        zp->z_dirlocks = NULL;
        zp->z_acl_cached = NULL;
        return (0);
@@ -154,7 +156,6 @@ zfs_znode_cache_destructor(void *buf, void *arg)
        avl_destroy(&zp->z_range_avl);
        mutex_destroy(&zp->z_range_lock);
 
-       ASSERT(zp->z_dbuf == NULL);
        ASSERT(zp->z_dirlocks == NULL);
        ASSERT(zp->z_acl_cached == NULL);
 }
@@ -198,24 +199,26 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
        nzp->z_last_itx = ozp->z_last_itx;
        nzp->z_gen = ozp->z_gen;
        nzp->z_sync_cnt = ozp->z_sync_cnt;
-       nzp->z_phys = ozp->z_phys;
-       nzp->z_dbuf = ozp->z_dbuf;
+       nzp->z_is_sa = ozp->z_is_sa;
+       nzp->z_sa_hdl = ozp->z_sa_hdl;
+       bcopy(ozp->z_atime, nzp->z_atime, sizeof (uint64_t) * 2);
+       nzp->z_links = ozp->z_links;
+       nzp->z_size = ozp->z_size;
+       nzp->z_pflags = ozp->z_pflags;
+       nzp->z_uid = ozp->z_uid;
+       nzp->z_gid = ozp->z_gid;
+       nzp->z_mode = ozp->z_mode;
 
        /*
-        * Release any cached ACL, since it *may* have
-        * zfs_acl_node_t's that directly references an
-        * embedded ACL in the zp_acl of the old znode_phys_t
-        *
-        * It will be recached the next time the ACL is needed.
+        * Since this is just an idle znode and kmem is already dealing with
+        * memory pressure, release any cached ACL.
         */
        if (ozp->z_acl_cached) {
                zfs_acl_free(ozp->z_acl_cached);
                ozp->z_acl_cached = NULL;
        }
 
-       /* Update back pointers. */
-       (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys,
-           znode_evict_error);
+       sa_set_userp(nzp->z_sa_hdl, nzp);
 
        /*
         * Invalidate the original znode by clearing fields that provide a
@@ -223,7 +226,7 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
         * ensure that zfs_znode_move() recognizes the znode as invalid in any
         * subsequent callback.
         */
-       ozp->z_dbuf = NULL;
+       ozp->z_sa_hdl = NULL;
        POINTER_INVALIDATE(&ozp->z_zfsvfs);
 }
 
@@ -478,6 +481,7 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
        sharezp->z_unlinked = 0;
        sharezp->z_atime_dirty = 0;
        sharezp->z_zfsvfs = zfsvfs;
+       sharezp->z_is_sa = zfsvfs->z_use_sa;
 
        vp = ZTOV(sharezp);
        vn_reinit(vp);
@@ -485,8 +489,7 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
 
        VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
            kcred, NULL, &acl_ids));
-       zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE,
-           &zp, 0, &acl_ids);
+       zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
        ASSERT3P(zp, ==, sharezp);
        ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */
        POINTER_INVALIDATE(&sharezp->z_zfsvfs);
@@ -496,8 +499,7 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
 
        zfs_acl_ids_free(&acl_ids);
        ZTOV(sharezp)->v_count = 0;
-       dmu_buf_rele(sharezp->z_dbuf, NULL);
-       sharezp->z_dbuf = NULL;
+       sa_handle_destroy(sharezp->z_sa_hdl);
        kmem_cache_free(znode_cache, sharezp);
 
        return (error);
@@ -561,25 +563,25 @@ zfs_cmpldev(uint64_t dev)
 }
 
 static void
-zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db)
+zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
+    dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
 {
-       znode_t         *nzp;
-
        ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
        ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
 
        mutex_enter(&zp->z_lock);
 
-       ASSERT(zp->z_dbuf == NULL);
-       zp->z_dbuf = db;
-       nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error);
+       ASSERT(zp->z_sa_hdl == NULL);
+       ASSERT(zp->z_acl_cached == NULL);
+       if (sa_hdl == NULL) {
+               VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
+                   SA_HDL_SHARED, &zp->z_sa_hdl));
+       } else {
+               zp->z_sa_hdl = sa_hdl;
+               sa_set_userp(sa_hdl, zp);
+       }
 
-       /*
-        * there should be no
-        * concurrent zgets on this object.
-        */
-       if (nzp != NULL)
-               panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db);
+       zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
 
        /*
         * Slap on VROOT if we are the root znode
@@ -594,14 +596,12 @@ zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db)
 void
 zfs_znode_dmu_fini(znode_t *zp)
 {
-       dmu_buf_t *db = zp->z_dbuf;
        ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
            zp->z_unlinked ||
            RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
-       ASSERT(zp->z_dbuf != NULL);
-       zp->z_dbuf = NULL;
-       VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL));
-       dmu_buf_rele(db, NULL);
+
+       sa_handle_destroy(zp->z_sa_hdl);
+       zp->z_sa_hdl = NULL;
 }
 
 /*
@@ -612,22 +612,27 @@ zfs_znode_dmu_fini(znode_t *zp)
  * return the znode
  */
 static znode_t *
-zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
+zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
+    dmu_object_type_t obj_type, sa_handle_t *hdl)
 {
        znode_t *zp;
        vnode_t *vp;
+       uint64_t mode;
+       uint64_t parent;
+       uint64_t uid, gid;
+       sa_bulk_attr_t bulk[9];
+       int count = 0;
 
        zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
 
        ASSERT(zp->z_dirlocks == NULL);
-       ASSERT(zp->z_dbuf == NULL);
        ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
 
        /*
         * Defer setting z_zfsvfs until the znode is ready to be a candidate for
         * the zfs_znode_move() callback.
         */
-       zp->z_phys = NULL;
+       zp->z_sa_hdl = NULL;
        zp->z_unlinked = 0;
        zp->z_atime_dirty = 0;
        zp->z_mapcnt = 0;
@@ -640,16 +645,41 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
        vp = ZTOV(zp);
        vn_reinit(vp);
 
-       zfs_znode_dmu_init(zfsvfs, zp, db);
-
-       zp->z_gen = zp->z_phys->zp_gen;
+       zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
+
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+           &zp->z_size, 8);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+           &zp->z_links, 8);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+           &zp->z_pflags, 8);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+           &zp->z_atime, 16);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+           &uid, 8);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+           &gid, 8);
+
+       if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) {
+               if (hdl == NULL)
+                       sa_handle_destroy(zp->z_sa_hdl);
+               kmem_cache_free(znode_cache, zp);
+               return (NULL);
+       }
 
+       zp->z_uid = zfs_fuid_map_id(zfsvfs, uid, CRED(), ZFS_OWNER);
+       zp->z_gid = zfs_fuid_map_id(zfsvfs, gid, CRED(), ZFS_GROUP);
+       zp->z_mode = mode;
        vp->v_vfsp = zfsvfs->z_parent->z_vfs;
-       vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
+
+       vp->v_type = IFTOVT((mode_t)mode);
 
        switch (vp->v_type) {
        case VDIR:
-               if (zp->z_phys->zp_flags & ZFS_XATTR) {
+               if (zp->z_pflags & ZFS_XATTR) {
                        vn_setops(vp, zfs_xdvnodeops);
                        vp->v_flag |= V_XATTRDIR;
                } else {
@@ -659,7 +689,13 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
                break;
        case VBLK:
        case VCHR:
-               vp->v_rdev = zfs_cmpldev(zp->z_phys->zp_rdev);
+               {
+                       uint64_t rdev;
+                       VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zfsvfs),
+                           &rdev, sizeof (rdev)) == 0);
+
+                       vp->v_rdev = zfs_cmpldev(rdev);
+               }
                /*FALLTHROUGH*/
        case VFIFO:
        case VSOCK:
@@ -668,10 +704,12 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
                break;
        case VREG:
                vp->v_flag |= VMODSORT;
-               if (zp->z_phys->zp_parent == zfsvfs->z_shares_dir)
+               if (parent == zfsvfs->z_shares_dir) {
+                       ASSERT(uid == 0 && gid == 0);
                        vn_setops(vp, zfs_sharevnodeops);
-               else
+               } else {
                        vn_setops(vp, zfs_fvnodeops);
+               }
                break;
        case VLNK:
                vn_setops(vp, zfs_symvnodeops);
@@ -695,6 +733,9 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
        return (zp);
 }
 
+static uint64_t empty_xattr;
+static uint64_t pad[4];
+static zfs_acl_phys_t acl_phys;
 /*
  * Create a new DMU object to hold a zfs znode.
  *
@@ -705,7 +746,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
  *             flag    - flags:
  *                       IS_ROOT_NODE  - new object will be root
  *                       IS_XATTR      - new object is an attribute
- *                       IS_REPLAY     - intent log replay
  *             bonuslen - length of bonus buffer
  *             setaclp  - File/Dir initial ACL
  *             fuidp    - Tracks fuid allocation.
@@ -715,20 +755,28 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
  */
 void
 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
-    uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids)
+    uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
 {
-       dmu_buf_t       *db;
-       znode_phys_t    *pzp;
+       uint64_t        crtime[2], atime[2], mtime[2], ctime[2];
+       uint64_t        mode, size, links, parent, pflags;
+       uint64_t        dzp_pflags = 0;
+       uint64_t        rdev = 0;
        zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
+       dmu_buf_t       *db;
        timestruc_t     now;
        uint64_t        gen, obj;
        int             err;
+       int             bonuslen;
+       sa_handle_t     *sa_hdl;
+       dmu_object_type_t obj_type;
+       sa_bulk_attr_t  sa_attrs[ZPL_END];
+       int             cnt = 0;
+       zfs_acl_locator_cb_t locate = { 0 };
 
        ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
 
        if (zfsvfs->z_replay) {
                obj = vap->va_nodeid;
-               flag |= IS_REPLAY;
                now = vap->va_ctime;            /* see zfs_replay_create() */
                gen = vap->va_nblocks;          /* ditto */
        } else {
@@ -737,6 +785,10 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
                gen = dmu_tx_get_txg(tx);
        }
 
+       obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
+       bonuslen = (obj_type == DMU_OT_SA) ?
+           DN_MAX_BONUSLEN : ZFS_OLD_ZNODE_PHYS_SIZE;
+
        /*
         * Create a new DMU object.
         */
@@ -747,109 +799,215 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
         * assertions below.
         */
        if (vap->va_type == VDIR) {
-               if (flag & IS_REPLAY) {
+               if (zfsvfs->z_replay) {
                        err = zap_create_claim_norm(zfsvfs->z_os, obj,
                            zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
-                           DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+                           obj_type, bonuslen, tx);
                        ASSERT3U(err, ==, 0);
                } else {
                        obj = zap_create_norm(zfsvfs->z_os,
                            zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
-                           DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+                           obj_type, bonuslen, tx);
                }
        } else {
-               if (flag & IS_REPLAY) {
+               if (zfsvfs->z_replay) {
                        err = dmu_object_claim(zfsvfs->z_os, obj,
                            DMU_OT_PLAIN_FILE_CONTENTS, 0,
-                           DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+                           obj_type, bonuslen, tx);
                        ASSERT3U(err, ==, 0);
                } else {
                        obj = dmu_object_alloc(zfsvfs->z_os,
                            DMU_OT_PLAIN_FILE_CONTENTS, 0,
-                           DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+                           obj_type, bonuslen, tx);
                }
        }
-       VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db));
-       dmu_buf_will_dirty(db, tx);
 
-       /*
-        * Initialize the znode physical data to zero.
-        */
-       ASSERT(db->db_size >= sizeof (znode_phys_t));
-       bzero(db->db_data, db->db_size);
-       pzp = db->db_data;
+       ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
+       VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
 
        /*
         * If this is the root, fix up the half-initialized parent pointer
         * to reference the just-allocated physical data area.
         */
        if (flag & IS_ROOT_NODE) {
-               dzp->z_dbuf = db;
-               dzp->z_phys = pzp;
                dzp->z_id = obj;
+       } else {
+               dzp_pflags = dzp->z_pflags;
        }
 
        /*
         * If parent is an xattr, so am I.
         */
-       if (dzp->z_phys->zp_flags & ZFS_XATTR)
+       if (dzp_pflags & ZFS_XATTR) {
                flag |= IS_XATTR;
-
-       if (vap->va_type == VBLK || vap->va_type == VCHR) {
-               pzp->zp_rdev = zfs_expldev(vap->va_rdev);
        }
 
        if (zfsvfs->z_use_fuids)
-               pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
+               pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
+       else
+               pflags = 0;
 
        if (vap->va_type == VDIR) {
-               pzp->zp_size = 2;               /* contents ("." and "..") */
-               pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
+               size = 2;               /* contents ("." and "..") */
+               links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
+       } else {
+               size = links = 0;
        }
 
-       pzp->zp_parent = dzp->z_id;
+       if (vap->va_type == VBLK || vap->va_type == VCHR) {
+               rdev = zfs_expldev(vap->va_rdev);
+       }
+
+       parent = dzp->z_id;
+       mode = acl_ids->z_mode;
        if (flag & IS_XATTR)
-               pzp->zp_flags |= ZFS_XATTR;
+               pflags |= ZFS_XATTR;
 
-       pzp->zp_gen = gen;
+       /*
+        * No execs denied will be determined when zfs_mode_compute() is called.
+        */
+       pflags |= acl_ids->z_aclp->z_hints &
+           (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
+           ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
 
-       ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
-       ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
+       ZFS_TIME_ENCODE(&now, crtime);
+       ZFS_TIME_ENCODE(&now, ctime);
 
        if (vap->va_mask & AT_ATIME) {
-               ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
+               ZFS_TIME_ENCODE(&vap->va_atime, atime);
        } else {
-               ZFS_TIME_ENCODE(&now, pzp->zp_atime);
+               ZFS_TIME_ENCODE(&now, atime);
        }
 
        if (vap->va_mask & AT_MTIME) {
-               ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
+               ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+       } else {
+               ZFS_TIME_ENCODE(&now, mtime);
+       }
+
+       /* Now add in all of the "SA" attributes */
+       VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
+           &sa_hdl));
+
+       /*
+        * Setup the array of attributes to be replaced/set on the new file
+        *
+        * order for  DMU_OT_ZNODE is critical since it needs to be constructed
+        * in the old znode_phys_t format.  Don't change this ordering
+        */
+
+       if (obj_type == DMU_OT_ZNODE) {
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+                   NULL, &atime, 16);
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+                   NULL, &mtime, 16);
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+                   NULL, &ctime, 16);
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+                   NULL, &crtime, 16);
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+                   NULL, &gen, 8);
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+                   NULL, &mode, 8);
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+                   NULL, &size, 8);
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+                   NULL, &parent, 8);
        } else {
-               ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+                   NULL, &mode, 8);
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+                   NULL, &size, 8);
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+                   NULL, &gen, 8);
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
+                   &acl_ids->z_fuid, 8);
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
+                   &acl_ids->z_fgid, 8);
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+                   NULL, &parent, 8);
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+                   NULL, &pflags, 8);
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+                   NULL, &atime, 16);
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+                   NULL, &mtime, 16);
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+                   NULL, &ctime, 16);
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+                   NULL, &crtime, 16);
+       }
+
+       SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
+
+       if (obj_type == DMU_OT_ZNODE) {
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
+                   &empty_xattr, 8);
        }
+       if (obj_type == DMU_OT_ZNODE ||
+           (vap->va_type == VBLK || vap->va_type == VCHR)) {
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
+                   NULL, &rdev, 8);
+
+       }
+       if (obj_type == DMU_OT_ZNODE) {
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+                   NULL, &pflags, 8);
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
+                   &acl_ids->z_fuid, 8);
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
+                   &acl_ids->z_fgid, 8);
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
+                   sizeof (uint64_t) * 4);
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+                   &acl_phys, sizeof (zfs_acl_phys_t));
+       } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+                   &acl_ids->z_aclp->z_acl_count, 8);
+               locate.cb_aclp = acl_ids->z_aclp;
+               SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
+                   zfs_acl_data_locator, &locate,
+                   acl_ids->z_aclp->z_acl_bytes);
+               mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
+                   acl_ids->z_fuid, acl_ids->z_fgid);
+       }
+
+       VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
 
-       pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
        if (!(flag & IS_ROOT_NODE)) {
-               ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
-               *zpp = zfs_znode_alloc(zfsvfs, db, 0);
-               ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
+               *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
+               ASSERT(*zpp != NULL);
        } else {
                /*
                 * If we are creating the root node, the "parent" we
                 * passed in is the znode for the root.
                 */
                *zpp = dzp;
+
+               (*zpp)->z_sa_hdl = sa_hdl;
        }
-       pzp->zp_uid = acl_ids->z_fuid;
-       pzp->zp_gid = acl_ids->z_fgid;
-       pzp->zp_mode = acl_ids->z_mode;
-       VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
+
+       (*zpp)->z_pflags = pflags;
+       (*zpp)->z_mode = mode;
+
        if (vap->va_mask & AT_XVATTR)
-               zfs_xvattr_set(*zpp, (xvattr_t *)vap);
+               zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);
+
+       if (obj_type == DMU_OT_ZNODE ||
+           acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
+               err = zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx);
+               ASSERT3P(err, ==, 0);
+       }
+       ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
 }
 
+/*
+ * zfs_xvattr_set only updates the in-core attributes;
+ * it is assumed the caller will be doing an sa_bulk_update
+ * to push the changes out.
+ */
 void
-zfs_xvattr_set(znode_t *zp, xvattr_t *xvap)
+zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
 {
        xoptattr_t *xoap;
 
@@ -857,60 +1015,76 @@ zfs_xvattr_set(znode_t *zp, xvattr_t *xvap)
        ASSERT(xoap);
 
        if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
-               ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime);
+               uint64_t times[2];
+               ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
+               (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
+                   &times, sizeof (times), tx);
                XVA_SET_RTN(xvap, XAT_CREATETIME);
        }
        if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
-               ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly);
+               ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
+                   zp->z_pflags, tx);
                XVA_SET_RTN(xvap, XAT_READONLY);
        }
        if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
-               ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden);
+               ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
+                   zp->z_pflags, tx);
                XVA_SET_RTN(xvap, XAT_HIDDEN);
        }
        if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
-               ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system);
+               ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
+                   zp->z_pflags, tx);
                XVA_SET_RTN(xvap, XAT_SYSTEM);
        }
        if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
-               ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive);
+               ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
+                   zp->z_pflags, tx);
                XVA_SET_RTN(xvap, XAT_ARCHIVE);
        }
        if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
-               ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable);
+               ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
+                   zp->z_pflags, tx);
                XVA_SET_RTN(xvap, XAT_IMMUTABLE);
        }
        if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
-               ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink);
+               ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
+                   zp->z_pflags, tx);
                XVA_SET_RTN(xvap, XAT_NOUNLINK);
        }
        if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
-               ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly);
+               ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
+                   zp->z_pflags, tx);
                XVA_SET_RTN(xvap, XAT_APPENDONLY);
        }
        if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
-               ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump);
+               ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
+                   zp->z_pflags, tx);
                XVA_SET_RTN(xvap, XAT_NODUMP);
        }
        if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
-               ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque);
+               ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
+                   zp->z_pflags, tx);
                XVA_SET_RTN(xvap, XAT_OPAQUE);
        }
        if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
                ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
-                   xoap->xoa_av_quarantined);
+                   xoap->xoa_av_quarantined, zp->z_pflags, tx);
                XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
        }
        if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
-               ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified);
+               ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
+                   zp->z_pflags, tx);
                XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
        }
        if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
-               (void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp,
-                   sizeof (xoap->xoa_av_scanstamp));
-               zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP;
+               zfs_sa_set_scanstamp(zp, xvap, tx);
                XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
        }
+       if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+               ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
+                   zp->z_pflags, tx);
+               XVA_SET_RTN(xvap, XAT_REPARSE);
+       }
 }
 
 int
@@ -920,35 +1094,42 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
        dmu_buf_t       *db;
        znode_t         *zp;
        int err;
+       sa_handle_t     *hdl;
 
        *zpp = NULL;
 
        ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
 
-       err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
+       err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
        if (err) {
                ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
                return (err);
        }
 
        dmu_object_info_from_db(db, &doi);
-       if (doi.doi_bonus_type != DMU_OT_ZNODE ||
-           doi.doi_bonus_size < sizeof (znode_phys_t)) {
-               dmu_buf_rele(db, NULL);
+       if (doi.doi_bonus_type != DMU_OT_SA &&
+           (doi.doi_bonus_type != DMU_OT_ZNODE ||
+           (doi.doi_bonus_type == DMU_OT_ZNODE &&
+           doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+               sa_buf_rele(db, NULL);
                ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
                return (EINVAL);
        }
 
-       zp = dmu_buf_get_user(db);
-       if (zp != NULL) {
-               mutex_enter(&zp->z_lock);
+       hdl = dmu_buf_get_user(db);
+       if (hdl != NULL) {
+               zp  = sa_get_userdata(hdl);
+
 
                /*
-                * Since we do immediate eviction of the z_dbuf, we
-                * should never find a dbuf with a znode that doesn't
-                * know about the dbuf.
+                * Since "SA" does immediate eviction we
+                * should never find an SA handle that doesn't
+                * know about the znode.
                 */
-               ASSERT3P(zp->z_dbuf, ==, db);
+
+               ASSERT3P(zp, !=, NULL);
+
+               mutex_enter(&zp->z_lock);
                ASSERT3U(zp->z_id, ==, obj_num);
                if (zp->z_unlinked) {
                        err = ENOENT;
@@ -957,7 +1138,7 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
                        *zpp = zp;
                        err = 0;
                }
-               dmu_buf_rele(db, NULL);
+               sa_buf_rele(db, NULL);
                mutex_exit(&zp->z_lock);
                ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
                return (err);
@@ -965,11 +1146,24 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
 
        /*
         * Not found create new znode/vnode
+        * but only if file exists.
+        *
+        * There is a small window where zfs_vget() could
+        * find this object while a file create is still in
+        * progress.  This is checked for in zfs_znode_alloc()
+        *
+        * if zfs_znode_alloc() fails it will drop the hold on the
+        * bonus buffer.
         */
-       zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size);
+       zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
+           doi.doi_bonus_type, NULL);
+       if (zp == NULL) {
+               err = ENOENT;
+       } else {
+               *zpp = zp;
+       }
        ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
-       *zpp = zp;
-       return (0);
+       return (err);
 }
 
 int
@@ -979,32 +1173,76 @@ zfs_rezget(znode_t *zp)
        dmu_object_info_t doi;
        dmu_buf_t *db;
        uint64_t obj_num = zp->z_id;
+       uint64_t mode;
+       uint64_t uid, gid;
+       sa_bulk_attr_t bulk[8];
        int err;
+       int count = 0;
+       uint64_t gen;
 
        ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
 
-       err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
+       mutex_enter(&zp->z_acl_lock);
+       if (zp->z_acl_cached) {
+               zfs_acl_free(zp->z_acl_cached);
+               zp->z_acl_cached = NULL;
+       }
+
+       mutex_exit(&zp->z_acl_lock);
+       ASSERT(zp->z_sa_hdl == NULL);
+       err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
        if (err) {
                ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
                return (err);
        }
 
        dmu_object_info_from_db(db, &doi);
-       if (doi.doi_bonus_type != DMU_OT_ZNODE ||
-           doi.doi_bonus_size < sizeof (znode_phys_t)) {
-               dmu_buf_rele(db, NULL);
+       if (doi.doi_bonus_type != DMU_OT_SA &&
+           (doi.doi_bonus_type != DMU_OT_ZNODE ||
+           (doi.doi_bonus_type == DMU_OT_ZNODE &&
+           doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+               sa_buf_rele(db, NULL);
                ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
                return (EINVAL);
        }
 
-       if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) {
-               dmu_buf_rele(db, NULL);
+       zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
+
+       /* reload cached values */
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
+           &gen, sizeof (gen));
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+           &zp->z_size, sizeof (zp->z_size));
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+           &zp->z_links, sizeof (zp->z_links));
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+           &zp->z_pflags, sizeof (zp->z_pflags));
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+           &zp->z_atime, sizeof (zp->z_atime));
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+           &uid, sizeof (uid));
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+           &gid, sizeof (gid));
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+           &mode, sizeof (mode));
+
+       zp->z_mode = mode;
+
+       if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
+               zfs_znode_dmu_fini(zp);
+               ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+               return (EIO);
+       }
+
+       if (gen != zp->z_gen) {
+               zfs_znode_dmu_fini(zp);
                ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
                return (EIO);
        }
 
-       zfs_znode_dmu_init(zfsvfs, zp, db);
-       zp->z_unlinked = (zp->z_phys->zp_links == 0);
+       zp->z_uid = zfs_fuid_map_id(zfsvfs, uid, CRED(), ZFS_OWNER);
+       zp->z_gid = zfs_fuid_map_id(zfsvfs, gid, CRED(), ZFS_GROUP);
+       zp->z_unlinked = (zp->z_links == 0);
        zp->z_blksz = doi.doi_data_block_size;
 
        ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
@@ -1018,7 +1256,7 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
        objset_t *os = zfsvfs->z_os;
        uint64_t obj = zp->z_id;
-       uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
+       uint64_t acl_obj = ZFS_EXTERNAL_ACL(zp);
 
        ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
        if (acl_obj)
@@ -1036,7 +1274,7 @@ zfs_zinactive(znode_t *zp)
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
        uint64_t z_id = zp->z_id;
 
-       ASSERT(zp->z_dbuf && zp->z_phys);
+       ASSERT(zp->z_sa_hdl);
 
        /*
         * Don't allow a zfs_zget() while were trying to release this znode
@@ -1075,6 +1313,7 @@ zfs_zinactive(znode_t *zp)
                zfs_rmnode(zp);
                return;
        }
+
        mutex_exit(&zp->z_lock);
        zfs_znode_dmu_fini(zp);
        ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
@@ -1106,59 +1345,40 @@ zfs_znode_free(znode_t *zp)
 }
 
 void
-zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
+zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
+    uint64_t ctime[2], boolean_t have_tx)
 {
        timestruc_t     now;
 
-       ASSERT(MUTEX_HELD(&zp->z_lock));
-
        gethrestime(&now);
 
-       if (tx) {
-               dmu_buf_will_dirty(zp->z_dbuf, tx);
+       if (have_tx) {  /* will sa_bulk_update happen really soon? */
                zp->z_atime_dirty = 0;
                zp->z_seq++;
        } else {
                zp->z_atime_dirty = 1;
        }
 
-       if (flag & AT_ATIME)
-               ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);
+       if (flag & AT_ATIME) {
+               ZFS_TIME_ENCODE(&now, zp->z_atime);
+       }
 
        if (flag & AT_MTIME) {
-               ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
-               if (zp->z_zfsvfs->z_use_fuids)
-                       zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED);
+               ZFS_TIME_ENCODE(&now, mtime);
+               if (zp->z_zfsvfs->z_use_fuids) {
+                       zp->z_pflags |= (ZFS_ARCHIVE |
+                           ZFS_AV_MODIFIED);
+               }
        }
 
        if (flag & AT_CTIME) {
-               ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
+               ZFS_TIME_ENCODE(&now, ctime);
                if (zp->z_zfsvfs->z_use_fuids)
-                       zp->z_phys->zp_flags |= ZFS_ARCHIVE;
+                       zp->z_pflags |= ZFS_ARCHIVE;
        }
 }
 
 /*
- * Update the requested znode timestamps with the current time.
- * If we are in a transaction, then go ahead and mark the znode
- * dirty in the transaction so the timestamps will go to disk.
- * Otherwise, we will get pushed next time the znode is updated
- * in a transaction, or when this znode eventually goes inactive.
- *
- * Why is this OK?
- *  1 - Only the ACCESS time is ever updated outside of a transaction.
- *  2 - Multiple consecutive updates will be collapsed into a single
- *     znode update by the transaction grouping semantics of the DMU.
- */
-void
-zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
-{
-       mutex_enter(&zp->z_lock);
-       zfs_time_stamper_locked(zp, flag, tx);
-       mutex_exit(&zp->z_lock);
-}
-
-/*
  * Grow the block size for a file.
  *
  *     IN:     zp      - znode of file to free data in.
@@ -1180,17 +1400,18 @@ zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
         * we will not grow.  If there is more than one block in a file,
         * the blocksize cannot change.
         */
-       if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
+       if (zp->z_blksz && zp->z_size > zp->z_blksz)
                return;
 
        error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
            size, 0, tx);
+
        if (error == ENOTSUP)
                return;
        ASSERT3U(error, ==, 0);
 
        /* What blocksize did we actually get? */
-       dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
+       dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
 }
 
 /*
@@ -1233,13 +1454,14 @@ zfs_extend(znode_t *zp, uint64_t end)
        /*
         * Nothing to do if file already at desired length.
         */
-       if (end <= zp->z_phys->zp_size) {
+       if (end <= zp->z_size) {
                zfs_range_unlock(rl);
                return (0);
        }
 top:
        tx = dmu_tx_create(zfsvfs->z_os);
-       dmu_tx_hold_bonus(tx, zp->z_id);
+       dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+       zfs_sa_upgrade_txholds(tx, zp);
        if (end > zp->z_blksz &&
            (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
                /*
@@ -1267,12 +1489,14 @@ top:
                zfs_range_unlock(rl);
                return (error);
        }
-       dmu_buf_will_dirty(zp->z_dbuf, tx);
 
        if (newblksz)
                zfs_grow_blocksize(zp, newblksz, tx);
 
-       zp->z_phys->zp_size = end;
+       zp->z_size = end;
+
+       VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
+           &zp->z_size, sizeof (zp->z_size), tx));
 
        zfs_range_unlock(rl);
 
@@ -1306,13 +1530,13 @@ zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
        /*
         * Nothing to do if file already at desired length.
         */
-       if (off >= zp->z_phys->zp_size) {
+       if (off >= zp->z_size) {
                zfs_range_unlock(rl);
                return (0);
        }
 
-       if (off + len > zp->z_phys->zp_size)
-               len = zp->z_phys->zp_size - off;
+       if (off + len > zp->z_size)
+               len = zp->z_size - off;
 
        error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
 
@@ -1347,7 +1571,7 @@ zfs_trunc(znode_t *zp, uint64_t end)
        /*
         * Nothing to do if file already at desired length.
         */
-       if (end >= zp->z_phys->zp_size) {
+       if (end >= zp->z_size) {
                zfs_range_unlock(rl);
                return (0);
        }
@@ -1359,7 +1583,8 @@ zfs_trunc(znode_t *zp, uint64_t end)
        }
 top:
        tx = dmu_tx_create(zfsvfs->z_os);
-       dmu_tx_hold_bonus(tx, zp->z_id);
+       dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+       zfs_sa_upgrade_txholds(tx, zp);
        error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error) {
                if (error == ERESTART) {
@@ -1371,9 +1596,11 @@ top:
                zfs_range_unlock(rl);
                return (error);
        }
-       dmu_buf_will_dirty(zp->z_dbuf, tx);
 
-       zp->z_phys->zp_size = end;
+       zp->z_size = end;
+
+       VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
+           &zp->z_size, sizeof (zp->z_size), tx));
 
        dmu_tx_commit(tx);
 
@@ -1425,9 +1652,17 @@ zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
        dmu_tx_t *tx;
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
        zilog_t *zilog = zfsvfs->z_log;
+       uint64_t mode;
+       uint64_t mtime[2], ctime[2];
+       sa_bulk_attr_t bulk[3];
+       int count = 0;
        int error;
 
-       if (off > zp->z_phys->zp_size) {
+       if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
+           sizeof (mode))) != 0)
+               return (error);
+
+       if (off > zp->z_size) {
                error =  zfs_extend(zp, off+len);
                if (error == 0 && log)
                        goto log;
@@ -1438,8 +1673,9 @@ zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
        /*
         * Check for any locks in the region to be freed.
         */
-       if (MANDLOCK(vp, (mode_t)zp->z_phys->zp_mode)) {
-               uint64_t length = (len ? len : zp->z_phys->zp_size - off);
+
+       if (MANDLOCK(vp, (mode_t)mode)) {
+               uint64_t length = (len ? len : zp->z_size - off);
                if (error = chklock(vp, FWRITE, off, length, flag, NULL))
                        return (error);
        }
@@ -1448,14 +1684,15 @@ zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
                error = zfs_trunc(zp, off);
        } else {
                if ((error = zfs_free_range(zp, off, len)) == 0 &&
-                   off + len > zp->z_phys->zp_size)
+                   off + len > zp->z_size)
                        error = zfs_extend(zp, off+len);
        }
        if (error || !log)
                return (error);
 log:
        tx = dmu_tx_create(zfsvfs->z_os);
-       dmu_tx_hold_bonus(tx, zp->z_id);
+       dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+       zfs_sa_upgrade_txholds(tx, zp);
        error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error) {
                if (error == ERESTART) {
@@ -1467,7 +1704,14 @@ log:
                return (error);
        }
 
-       zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+           NULL, &zp->z_pflags, 8);
+       zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
+       error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+       ASSERT(error == 0);
+
        zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
 
        dmu_tx_commit(tx);
@@ -1478,11 +1722,12 @@ void
 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 {
        zfsvfs_t        zfsvfs;
-       uint64_t        moid, obj, version;
+       uint64_t        moid, obj, sa_obj, version;
        uint64_t        sense = ZFS_CASE_SENSITIVE;
        uint64_t        norm = 0;
        nvpair_t        *elem;
        int             error;
+       int             i;
        znode_t         *rootzp = NULL;
        vnode_t         *vp;
        vattr_t         vattr;
@@ -1504,12 +1749,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
        /*
         * Set starting attributes.
         */
-       if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE)
-               version = ZPL_VERSION;
-       else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
-               version = ZPL_VERSION_USERSPACE - 1;
-       else
-               version = ZPL_VERSION_FUID - 1;
+       version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
        elem = NULL;
        while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
                /* For the moment we expect all zpl props to be uint64_ts */
@@ -1535,6 +1775,18 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
        error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
 
        /*
+        * Create zap object used for SA attribute registration
+        */
+
+       if (version >= ZPL_VERSION_SA) {
+               sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+                   DMU_OT_NONE, 0, tx);
+               error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+               ASSERT(error == 0);
+       } else {
+               sa_obj = 0;
+       }
+       /*
         * Create a delete queue.
         */
        obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
@@ -1555,6 +1807,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
        rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
        rootzp->z_unlinked = 0;
        rootzp->z_atime_dirty = 0;
+       rootzp->z_is_sa = USE_SA(version, os);
 
        vp = ZTOV(rootzp);
        vn_reinit(vp);
@@ -1566,7 +1819,11 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
        zfsvfs.z_parent = &zfsvfs;
        zfsvfs.z_version = version;
        zfsvfs.z_use_fuids = USE_FUIDS(version, os);
+       zfsvfs.z_use_sa = USE_SA(version, os);
        zfsvfs.z_norm = norm;
+
+       zfsvfs.z_attr_table = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END);
+
        /*
         * Fold case on file systems that are always or sometimes case
         * insensitive.
@@ -1578,11 +1835,14 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
        list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
            offsetof(znode_t, z_link_node));
 
+       for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+               mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+
        ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
        rootzp->z_zfsvfs = &zfsvfs;
        VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
            cr, NULL, &acl_ids));
-       zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids);
+       zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
        ASSERT3P(zp, ==, rootzp);
        ASSERT(!vn_in_dnlc(ZTOV(rootzp))); /* not valid to move */
        error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
@@ -1591,8 +1851,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
        POINTER_INVALIDATE(&rootzp->z_zfsvfs);
 
        ZTOV(rootzp)->v_count = 0;
-       dmu_buf_rele(rootzp->z_dbuf, NULL);
-       rootzp->z_dbuf = NULL;
+       sa_handle_destroy(rootzp->z_sa_hdl);
        kmem_cache_free(znode_cache, rootzp);
 
        /*
@@ -1602,36 +1861,65 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
        error = zfs_create_share_dir(&zfsvfs, tx);
 
        ASSERT(error == 0);
+
+       for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+               mutex_destroy(&zfsvfs.z_hold_mtx[i]);
 }
 
 #endif /* _KERNEL */
+
 /*
  * Given an object number, return its parent object number and whether
  * or not the object is an extended attribute directory.
  */
 static int
-zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
+zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir,
+    sa_attr_type_t *sa_table)
 {
        dmu_buf_t *db;
        dmu_object_info_t doi;
-       znode_phys_t *zp;
        int error;
-
-       if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0)
+       uint64_t parent;
+       uint64_t pflags;
+       uint64_t mode;
+       sa_bulk_attr_t bulk[3];
+       sa_handle_t *hdl;
+       int count = 0;
+
+       if ((error = sa_buf_hold(osp, obj, FTAG, &db)) != 0)
                return (error);
 
        dmu_object_info_from_db(db, &doi);
-       if (doi.doi_bonus_type != DMU_OT_ZNODE ||
+       if ((doi.doi_bonus_type != DMU_OT_SA &&
+           doi.doi_bonus_type != DMU_OT_ZNODE) ||
+           doi.doi_bonus_type == DMU_OT_ZNODE &&
            doi.doi_bonus_size < sizeof (znode_phys_t)) {
-               dmu_buf_rele(db, FTAG);
+               sa_buf_rele(db, FTAG);
                return (EINVAL);
        }
 
-       zp = db->db_data;
-       *pobjp = zp->zp_parent;
-       *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) &&
-           S_ISDIR(zp->zp_mode);
-       dmu_buf_rele(db, FTAG);
+       if ((error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE,
+           &hdl)) != 0) {
+               sa_buf_rele(db, FTAG);
+               return (error);
+       }
+
+       SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT],
+           NULL, &parent, 8);
+       SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
+           &pflags, 8);
+       SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+           &mode, 8);
+
+       if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) {
+               sa_buf_rele(db, FTAG);
+               sa_handle_destroy(hdl);
+               return (error);
+       }
+       *pobjp = parent;
+       *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
+       sa_handle_destroy(hdl);
+       sa_buf_rele(db, FTAG);
 
        return (0);
 }
@@ -1640,10 +1928,19 @@ int
 zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
 {
        char *path = buf + len - 1;
+       sa_attr_type_t *sa_table;
        int error;
+       uint64_t sa_obj = 0;
 
        *path = '\0';
 
+       error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
+
+       if (error != 0 && error != ENOENT)
+               return (error);
+
+       sa_table = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END);
+
        for (;;) {
                uint64_t pobj;
                char component[MAXNAMELEN + 2];
@@ -1651,7 +1948,7 @@ zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
                int is_xattrdir;
 
                if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
-                   &is_xattrdir)) != 0)
+                   &is_xattrdir, sa_table)) != 0)
                        break;
 
                if (pobj == obj) {
@@ -1679,5 +1976,6 @@ zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
 
        if (error == 0)
                (void) memmove(buf, path, buf + len - path);
+
        return (error);
 }
index db3822f..4aa4d10 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
+/* Portions Copyright 2010 Robert Milkowski */
+
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
-#include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/zap.h>
 #include <sys/arc.h>
@@ -36,6 +36,7 @@
 #include <sys/dsl_dataset.h>
 #include <sys/vdev.h>
 #include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
 
 /*
  * The zfs intent log (ZIL) saves transaction records of system calls
@@ -66,7 +67,7 @@
 /*
  * This global ZIL switch affects all pools
  */
-int zil_disable = 0;   /* disable intent logging */
+int zil_replay_disable = 0;    /* disable intent logging replay */
 
 /*
  * Tunable parameter for debugging or performance analysis.  Setting
@@ -77,11 +78,17 @@ boolean_t zfs_nocacheflush = B_FALSE;
 
 static kmem_cache_t *zil_lwb_cache;
 
+static boolean_t zil_empty(zilog_t *zilog);
+
+#define        LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
+    sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))
+
+
 static int
-zil_dva_compare(const void *x1, const void *x2)
+zil_bp_compare(const void *x1, const void *x2)
 {
-       const dva_t *dva1 = x1;
-       const dva_t *dva2 = x2;
+       const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
+       const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;
 
        if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
                return (-1);
@@ -97,34 +104,37 @@ zil_dva_compare(const void *x1, const void *x2)
 }
 
 static void
-zil_dva_tree_init(avl_tree_t *t)
+zil_bp_tree_init(zilog_t *zilog)
 {
-       avl_create(t, zil_dva_compare, sizeof (zil_dva_node_t),
-           offsetof(zil_dva_node_t, zn_node));
+       avl_create(&zilog->zl_bp_tree, zil_bp_compare,
+           sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
 }
 
 static void
-zil_dva_tree_fini(avl_tree_t *t)
+zil_bp_tree_fini(zilog_t *zilog)
 {
-       zil_dva_node_t *zn;
+       avl_tree_t *t = &zilog->zl_bp_tree;
+       zil_bp_node_t *zn;
        void *cookie = NULL;
 
        while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
-               kmem_free(zn, sizeof (zil_dva_node_t));
+               kmem_free(zn, sizeof (zil_bp_node_t));
 
        avl_destroy(t);
 }
 
-static int
-zil_dva_tree_add(avl_tree_t *t, dva_t *dva)
+int
+zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
 {
-       zil_dva_node_t *zn;
+       avl_tree_t *t = &zilog->zl_bp_tree;
+       const dva_t *dva = BP_IDENTITY(bp);
+       zil_bp_node_t *zn;
        avl_index_t where;
 
        if (avl_find(t, dva, &where) != NULL)
                return (EEXIST);
 
-       zn = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP);
+       zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
        zn->zn_dva = *dva;
        avl_insert(t, zn, where);
 
@@ -149,35 +159,31 @@ zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
 }
 
 /*
- * Read a log block, make sure it's valid, and byteswap it if necessary.
+ * Read a log block and make sure it's valid.
  */
 static int
-zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
+zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
+    char **end)
 {
-       blkptr_t blk = *bp;
-       zbookmark_t zb;
+       enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
        uint32_t aflags = ARC_WAIT;
+       arc_buf_t *abuf = NULL;
+       zbookmark_t zb;
        int error;
 
-       zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
-       zb.zb_object = 0;
-       zb.zb_level = -1;
-       zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
+       if (zilog->zl_header->zh_claim_txg == 0)
+               zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
 
-       *abufpp = NULL;
+       if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
+               zio_flags |= ZIO_FLAG_SPECULATIVE;
 
-       /*
-        * We shouldn't be doing any scrubbing while we're doing log
-        * replay, it's OK to not lock.
-        */
-       error = arc_read_nolock(NULL, zilog->zl_spa, &blk,
-           arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL |
-           ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb);
+       SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
+           ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+       error = dsl_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
+           ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
 
        if (error == 0) {
-               char *data = (*abufpp)->b_data;
-               uint64_t blksz = BP_GET_LSIZE(bp);
-               zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1;
                zio_cksum_t cksum = bp->blk_cksum;
 
                /*
@@ -190,43 +196,102 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
                 */
                cksum.zc_word[ZIL_ZC_SEQ]++;
 
-               if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum,
-                   sizeof (cksum)) || BP_IS_HOLE(&ztp->zit_next_blk) ||
-                   (ztp->zit_nused > (blksz - sizeof (zil_trailer_t)))) {
-                       error = ECKSUM;
-               }
+               if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
+                       zil_chain_t *zilc = abuf->b_data;
+                       char *lr = (char *)(zilc + 1);
+                       uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);
 
-               if (error) {
-                       VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1);
-                       *abufpp = NULL;
+                       if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
+                           sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
+                               error = ECKSUM;
+                       } else {
+                               bcopy(lr, dst, len);
+                               *end = (char *)dst + len;
+                               *nbp = zilc->zc_next_blk;
+                       }
+               } else {
+                       char *lr = abuf->b_data;
+                       uint64_t size = BP_GET_LSIZE(bp);
+                       zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
+
+                       if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
+                           sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
+                           (zilc->zc_nused > (size - sizeof (*zilc)))) {
+                               error = ECKSUM;
+                       } else {
+                               bcopy(lr, dst, zilc->zc_nused);
+                               *end = (char *)dst + zilc->zc_nused;
+                               *nbp = zilc->zc_next_blk;
+                       }
                }
+
+               VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+       }
+
+       return (error);
+}
+
+/*
+ * Read a TX_WRITE log data block.
+ */
+static int
+zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
+{
+       enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
+       const blkptr_t *bp = &lr->lr_blkptr;
+       uint32_t aflags = ARC_WAIT;
+       arc_buf_t *abuf = NULL;
+       zbookmark_t zb;
+       int error;
+
+       if (BP_IS_HOLE(bp)) {
+               if (wbuf != NULL)
+                       bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
+               return (0);
        }
 
-       dprintf("error %d on %llu:%llu\n", error, zb.zb_objset, zb.zb_blkid);
+       if (zilog->zl_header->zh_claim_txg == 0)
+               zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
+
+       SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
+           ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
+
+       error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
+           ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
+
+       if (error == 0) {
+               if (wbuf != NULL)
+                       bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
+               (void) arc_buf_remove_ref(abuf, &abuf);
+       }
 
        return (error);
 }
 
 /*
  * Parse the intent log, and call parse_func for each valid record within.
- * Return the highest sequence number.
  */
-uint64_t
+int
 zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
     zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
 {
        const zil_header_t *zh = zilog->zl_header;
-       uint64_t claim_seq = zh->zh_claim_seq;
-       uint64_t seq = 0;
-       uint64_t max_seq = 0;
-       blkptr_t blk = zh->zh_log;
-       arc_buf_t *abuf;
+       boolean_t claimed = !!zh->zh_claim_txg;
+       uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
+       uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
+       uint64_t max_blk_seq = 0;
+       uint64_t max_lr_seq = 0;
+       uint64_t blk_count = 0;
+       uint64_t lr_count = 0;
+       blkptr_t blk, next_blk;
        char *lrbuf, *lrp;
-       zil_trailer_t *ztp;
-       int reclen, error;
+       int error = 0;
 
-       if (BP_IS_HOLE(&blk))
-               return (max_seq);
+       /*
+        * Old logs didn't record the maximum zh_claim_lr_seq.
+        */
+       if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
+               claim_lr_seq = UINT64_MAX;
 
        /*
         * Starting at the block pointed to by zh_log we read the log chain.
@@ -237,105 +302,156 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
         * If the log has been claimed, stop if we encounter a sequence
         * number greater than the highest claimed sequence number.
         */
-       zil_dva_tree_init(&zilog->zl_dva_tree);
-       for (;;) {
-               seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+       lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
+       zil_bp_tree_init(zilog);
 
-               if (claim_seq != 0 && seq > claim_seq)
-                       break;
-
-               ASSERT(max_seq < seq);
-               max_seq = seq;
+       for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
+               uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+               int reclen;
+               char *end;
 
-               error = zil_read_log_block(zilog, &blk, &abuf);
+               if (blk_seq > claim_blk_seq)
+                       break;
+               if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
+                       break;
+               ASSERT3U(max_blk_seq, <, blk_seq);
+               max_blk_seq = blk_seq;
+               blk_count++;
 
-               if (parse_blk_func != NULL)
-                       parse_blk_func(zilog, &blk, arg, txg);
+               if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
+                       break;
 
+               error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
                if (error)
                        break;
 
-               lrbuf = abuf->b_data;
-               ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
-               blk = ztp->zit_next_blk;
-
-               if (parse_lr_func == NULL) {
-                       VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
-                       continue;
-               }
-
-               for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
+               for (lrp = lrbuf; lrp < end; lrp += reclen) {
                        lr_t *lr = (lr_t *)lrp;
                        reclen = lr->lrc_reclen;
                        ASSERT3U(reclen, >=, sizeof (lr_t));
-                       parse_lr_func(zilog, lr, arg, txg);
+                       if (lr->lrc_seq > claim_lr_seq)
+                               goto done;
+                       if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
+                               goto done;
+                       ASSERT3U(max_lr_seq, <, lr->lrc_seq);
+                       max_lr_seq = lr->lrc_seq;
+                       lr_count++;
                }
-               VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
        }
-       zil_dva_tree_fini(&zilog->zl_dva_tree);
+done:
+       zilog->zl_parse_error = error;
+       zilog->zl_parse_blk_seq = max_blk_seq;
+       zilog->zl_parse_lr_seq = max_lr_seq;
+       zilog->zl_parse_blk_count = blk_count;
+       zilog->zl_parse_lr_count = lr_count;
+
+       ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
+           (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
+
+       zil_bp_tree_fini(zilog);
+       zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
 
-       return (max_seq);
+       return (error);
 }
 
-/* ARGSUSED */
-static void
+static int
 zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
 {
-       spa_t *spa = zilog->zl_spa;
-       int err;
-
        /*
         * Claim log block if not already committed and not already claimed.
+        * If tx == NULL, just verify that the block is claimable.
         */
-       if (bp->blk_birth >= first_txg &&
-           zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) {
-               err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL,
-                   ZIO_FLAG_MUSTSUCCEED));
-               ASSERT(err == 0);
-       }
+       if (bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0)
+               return (0);
+
+       return (zio_wait(zio_claim(NULL, zilog->zl_spa,
+           tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
+           ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
 }
 
-static void
+static int
 zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
 {
-       if (lrc->lrc_txtype == TX_WRITE) {
-               lr_write_t *lr = (lr_write_t *)lrc;
-               zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg);
-       }
+       lr_write_t *lr = (lr_write_t *)lrc;
+       int error;
+
+       if (lrc->lrc_txtype != TX_WRITE)
+               return (0);
+
+       /*
+        * If the block is not readable, don't claim it.  This can happen
+        * in normal operation when a log block is written to disk before
+        * some of the dmu_sync() blocks it points to.  In this case, the
+        * transaction cannot have been committed to anyone (we would have
+        * waited for all writes to be stable first), so it is semantically
+        * correct to declare this the end of the log.
+        */
+       if (lr->lr_blkptr.blk_birth >= first_txg &&
+           (error = zil_read_log_data(zilog, lr, NULL)) != 0)
+               return (error);
+       return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
 }
 
 /* ARGSUSED */
-static void
+static int
 zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
 {
-       zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx));
+       zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
+
+       return (0);
 }
 
-static void
+static int
 zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
 {
+       lr_write_t *lr = (lr_write_t *)lrc;
+       blkptr_t *bp = &lr->lr_blkptr;
+
        /*
         * If we previously claimed it, we need to free it.
         */
-       if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE) {
-               lr_write_t *lr = (lr_write_t *)lrc;
-               blkptr_t *bp = &lr->lr_blkptr;
-               if (bp->blk_birth >= claim_txg &&
-                   !zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp))) {
-                       (void) arc_free(NULL, zilog->zl_spa,
-                           dmu_tx_get_txg(tx), bp, NULL, NULL, ARC_WAIT);
-               }
+       if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
+           bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0)
+               zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
+
+       return (0);
+}
+
+static lwb_t *
+zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
+{
+       lwb_t *lwb;
+
+       lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
+       lwb->lwb_zilog = zilog;
+       lwb->lwb_blk = *bp;
+       lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
+       lwb->lwb_max_txg = txg;
+       lwb->lwb_zio = NULL;
+       lwb->lwb_tx = NULL;
+       if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
+               lwb->lwb_nused = sizeof (zil_chain_t);
+               lwb->lwb_sz = BP_GET_LSIZE(bp);
+       } else {
+               lwb->lwb_nused = 0;
+               lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
        }
+
+       mutex_enter(&zilog->zl_lock);
+       list_insert_tail(&zilog->zl_lwb_list, lwb);
+       mutex_exit(&zilog->zl_lock);
+
+       return (lwb);
 }
 
 /*
  * Create an on-disk intent log.
  */
-static void
+static lwb_t *
 zil_create(zilog_t *zilog)
 {
        const zil_header_t *zh = zilog->zl_header;
-       lwb_t *lwb;
+       lwb_t *lwb = NULL;
        uint64_t txg = 0;
        dmu_tx_t *tx = NULL;
        blkptr_t blk;
@@ -352,22 +468,23 @@ zil_create(zilog_t *zilog)
        blk = zh->zh_log;
 
        /*
-        * If we don't already have an initial log block or we have one
-        * but it's the wrong endianness then allocate one.
+        * Allocate an initial log block if:
+        *    - there isn't one already
+        *    - the existing block is the wrong endianness
         */
        if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
                tx = dmu_tx_create(zilog->zl_os);
-               (void) dmu_tx_assign(tx, TXG_WAIT);
+               VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
                dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
                txg = dmu_tx_get_txg(tx);
 
                if (!BP_IS_HOLE(&blk)) {
-                       zio_free_blk(zilog->zl_spa, &blk, txg);
+                       zio_free_zil(zilog->zl_spa, txg, &blk);
                        BP_ZERO(&blk);
                }
 
-               error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk,
-                   NULL, txg);
+               error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
+                   ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
 
                if (error == 0)
                        zil_init_log_chain(zilog, &blk);
@@ -376,20 +493,8 @@ zil_create(zilog_t *zilog)
        /*
         * Allocate a log write buffer (lwb) for the first log block.
         */
-       if (error == 0) {
-               lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
-               lwb->lwb_zilog = zilog;
-               lwb->lwb_blk = blk;
-               lwb->lwb_nused = 0;
-               lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk);
-               lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
-               lwb->lwb_max_txg = txg;
-               lwb->lwb_zio = NULL;
-
-               mutex_enter(&zilog->zl_lock);
-               list_insert_tail(&zilog->zl_lwb_list, lwb);
-               mutex_exit(&zilog->zl_lock);
-       }
+       if (error == 0)
+               lwb = zil_alloc_lwb(zilog, &blk, txg);
 
        /*
         * If we just allocated the first log block, commit our transaction
@@ -402,6 +507,8 @@ zil_create(zilog_t *zilog)
        }
 
        ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
+
+       return (lwb);
 }
 
 /*
@@ -426,26 +533,18 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
         */
        txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 
+       zilog->zl_old_header = *zh;             /* debugging aid */
+
        if (BP_IS_HOLE(&zh->zh_log))
                return;
 
        tx = dmu_tx_create(zilog->zl_os);
-       (void) dmu_tx_assign(tx, TXG_WAIT);
+       VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
        dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
        txg = dmu_tx_get_txg(tx);
 
        mutex_enter(&zilog->zl_lock);
 
-       /*
-        * It is possible for the ZIL to get the previously mounted zilog
-        * structure of the same dataset if quickly remounted and the dbuf
-        * eviction has not completed. In this case we can see a non
-        * empty lwb list and keep_first will be set. We fix this by
-        * clearing the keep_first. This will be slower but it's very rare.
-        */
-       if (!list_is_empty(&zilog->zl_lwb_list) && keep_first)
-               keep_first = B_FALSE;
-
        ASSERT3U(zilog->zl_destroy_txg, <, txg);
        zilog->zl_destroy_txg = txg;
        zilog->zl_keep_first = keep_first;
@@ -457,41 +556,20 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
                        list_remove(&zilog->zl_lwb_list, lwb);
                        if (lwb->lwb_buf != NULL)
                                zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
-                       zio_free_blk(zilog->zl_spa, &lwb->lwb_blk, txg);
+                       zio_free_zil(zilog->zl_spa, txg, &lwb->lwb_blk);
                        kmem_cache_free(zil_lwb_cache, lwb);
                }
-       } else {
-               if (!keep_first) {
-                       (void) zil_parse(zilog, zil_free_log_block,
-                           zil_free_log_record, tx, zh->zh_claim_txg);
-               }
+       } else if (!keep_first) {
+               (void) zil_parse(zilog, zil_free_log_block,
+                   zil_free_log_record, tx, zh->zh_claim_txg);
        }
        mutex_exit(&zilog->zl_lock);
 
        dmu_tx_commit(tx);
 }
 
-/*
- * return true if the initial log block is not valid
- */
-static boolean_t
-zil_empty(zilog_t *zilog)
-{
-       const zil_header_t *zh = zilog->zl_header;
-       arc_buf_t *abuf = NULL;
-
-       if (BP_IS_HOLE(&zh->zh_log))
-               return (B_TRUE);
-
-       if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0)
-               return (B_TRUE);
-
-       VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
-       return (B_FALSE);
-}
-
 int
-zil_claim(char *osname, void *txarg)
+zil_claim(const char *osname, void *txarg)
 {
        dmu_tx_t *tx = txarg;
        uint64_t first_txg = dmu_tx_get_txg(tx);
@@ -500,7 +578,7 @@ zil_claim(char *osname, void *txarg)
        objset_t *os;
        int error;
 
-       error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
+       error = dmu_objset_hold(osname, FTAG, &os);
        if (error) {
                cmn_err(CE_WARN, "can't open objset for %s", osname);
                return (0);
@@ -509,28 +587,13 @@ zil_claim(char *osname, void *txarg)
        zilog = dmu_objset_zil(os);
        zh = zil_header_in_syncing_context(zilog);
 
-       if (zilog->zl_spa->spa_log_state == SPA_LOG_CLEAR) {
+       if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) {
                if (!BP_IS_HOLE(&zh->zh_log))
-                       zio_free_blk(zilog->zl_spa, &zh->zh_log, first_txg);
+                       zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
                BP_ZERO(&zh->zh_log);
                dsl_dataset_dirty(dmu_objset_ds(os), tx);
-       }
-
-       /*
-        * Record here whether the zil has any records to replay.
-        * If the header block pointer is null or the block points
-        * to the stubby then we know there are no valid log records.
-        * We use the header to store this state as the the zilog gets
-        * freed later in dmu_objset_close().
-        * The flags (and the rest of the header fields) are cleared in
-        * zil_sync() as a result of a zil_destroy(), after replaying the log.
-        *
-        * Note, the intent log can be empty but still need the
-        * stubby to be claimed.
-        */
-       if (!zil_empty(zilog)) {
-               zh->zh_flags |= ZIL_REPLAY_NEEDED;
-               dsl_dataset_dirty(dmu_objset_ds(os), tx);
+               dmu_objset_rele(os, FTAG);
+               return (0);
        }
 
        /*
@@ -542,14 +605,19 @@ zil_claim(char *osname, void *txarg)
         */
        ASSERT3U(zh->zh_claim_txg, <=, first_txg);
        if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
-               zh->zh_claim_txg = first_txg;
-               zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block,
+               (void) zil_parse(zilog, zil_claim_log_block,
                    zil_claim_log_record, tx, first_txg);
+               zh->zh_claim_txg = first_txg;
+               zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
+               zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
+               if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
+                       zh->zh_flags |= ZIL_REPLAY_NEEDED;
+               zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
                dsl_dataset_dirty(dmu_objset_ds(os), tx);
        }
 
        ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
-       dmu_objset_close(os);
+       dmu_objset_rele(os, FTAG);
        return (0);
 }
 
@@ -558,46 +626,36 @@ zil_claim(char *osname, void *txarg)
  * Checksum errors are ok as they indicate the end of the chain.
  * Any other error (no device or read failure) returns an error.
  */
-/* ARGSUSED */
 int
-zil_check_log_chain(char *osname, void *txarg)
+zil_check_log_chain(const char *osname, void *tx)
 {
        zilog_t *zilog;
-       zil_header_t *zh;
-       blkptr_t blk;
-       arc_buf_t *abuf;
        objset_t *os;
-       char *lrbuf;
-       zil_trailer_t *ztp;
        int error;
 
-       error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
+       ASSERT(tx == NULL);
+
+       error = dmu_objset_hold(osname, FTAG, &os);
        if (error) {
                cmn_err(CE_WARN, "can't open objset for %s", osname);
                return (0);
        }
 
        zilog = dmu_objset_zil(os);
-       zh = zil_header_in_syncing_context(zilog);
-       blk = zh->zh_log;
-       if (BP_IS_HOLE(&blk)) {
-               dmu_objset_close(os);
-               return (0); /* no chain */
-       }
 
-       for (;;) {
-               error = zil_read_log_block(zilog, &blk, &abuf);
-               if (error)
-                       break;
-               lrbuf = abuf->b_data;
-               ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
-               blk = ztp->zit_next_blk;
-               VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
-       }
-       dmu_objset_close(os);
-       if (error == ECKSUM)
-               return (0); /* normal end of chain */
-       return (error);
+       /*
+        * Because tx == NULL, zil_claim_log_block() will not actually claim
+        * any blocks, but just determine whether it is possible to do so.
+        * In addition to checking the log chain, zil_claim_log_block()
+        * will invoke zio_claim() with a done func of spa_claim_notify(),
+        * which will update spa_max_claim_txg.  See spa_load() for details.
+        */
+       error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
+           zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));
+
+       dmu_objset_rele(os, FTAG);
+
+       return ((error == ECKSUM || error == ENOENT) ? 0 : error);
 }
 
 static int
@@ -615,7 +673,7 @@ zil_vdev_compare(const void *x1, const void *x2)
 }
 
 void
-zil_add_block(zilog_t *zilog, blkptr_t *bp)
+zil_add_block(zilog_t *zilog, const blkptr_t *bp)
 {
        avl_tree_t *t = &zilog->zl_vdev_tree;
        avl_index_t where;
@@ -691,9 +749,9 @@ zil_lwb_write_done(zio_t *zio)
 {
        lwb_t *lwb = zio->io_private;
        zilog_t *zilog = lwb->lwb_zilog;
+       dmu_tx_t *tx = lwb->lwb_tx;
 
        ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
-       ASSERT(BP_GET_CHECKSUM(zio->io_bp) == ZIO_CHECKSUM_ZILOG);
        ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
        ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
        ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
@@ -712,17 +770,15 @@ zil_lwb_write_done(zio_t *zio)
        zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
        mutex_enter(&zilog->zl_lock);
        lwb->lwb_buf = NULL;
-       if (zio->io_error)
-               zilog->zl_log_error = B_TRUE;
+       lwb->lwb_tx = NULL;
+       mutex_exit(&zilog->zl_lock);
 
        /*
         * Now that we've written this log block, we have a stable pointer
         * to the next block in the chain, so it's OK to let the txg in
-        * which we allocated the next block sync. We still have the
-        * zl_lock to ensure zil_sync doesn't kmem free the lwb.
+        * which we allocated the next block sync.
         */
-       txg_rele_to_sync(&lwb->lwb_txgh);
-       mutex_exit(&zilog->zl_lock);
+       dmu_tx_commit(tx);
 }
 
 /*
@@ -733,10 +789,9 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
 {
        zbookmark_t zb;
 
-       zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET];
-       zb.zb_object = 0;
-       zb.zb_level = -1;
-       zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+       SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+           ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
+           lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
        if (zilog->zl_root_zio == NULL) {
                zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
@@ -744,118 +799,147 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
        }
        if (lwb->lwb_zio == NULL) {
                lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
-                   0, &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz,
+                   0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
                    zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE,
-                   ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb);
+                   ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
        }
 }
 
 /*
+ * Define a limited set of intent log block sizes.
+ * These must be a multiple of 4KB. Note only the amount used (again
+ * aligned to 4KB) actually gets written. However, we can't always just
+ * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted.
+ */
+uint64_t zil_block_buckets[] = {
+    4096,              /* non TX_WRITE */
+    8192+4096,         /* data base */
+    32*1024 + 4096,    /* NFS writes */
+    UINT64_MAX
+};
+
+/*
+ * Use the slog as long as the logbias is 'latency' and the current commit size
+ * is less than the limit or the total list size is less than 2X the limit.
+ * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
+ */
+uint64_t zil_slog_limit = 1024 * 1024;
+#define        USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \
+       (((zilog)->zl_cur_used < zil_slog_limit) || \
+       ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))))
+
+/*
  * Start a log block write and advance to the next log block.
  * Calls are serialized.
  */
 static lwb_t *
 zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
 {
-       lwb_t *nlwb;
-       zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
+       lwb_t *nlwb = NULL;
+       zil_chain_t *zilc;
        spa_t *spa = zilog->zl_spa;
-       blkptr_t *bp = &ztp->zit_next_blk;
+       blkptr_t *bp;
+       dmu_tx_t *tx;
        uint64_t txg;
-       uint64_t zil_blksz;
-       int error;
+       uint64_t zil_blksz, wsz;
+       int i, error;
+
+       if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
+               zilc = (zil_chain_t *)lwb->lwb_buf;
+               bp = &zilc->zc_next_blk;
+       } else {
+               zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
+               bp = &zilc->zc_next_blk;
+       }
 
-       ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));
+       ASSERT(lwb->lwb_nused <= lwb->lwb_sz);
 
        /*
         * Allocate the next block and save its address in this block
         * before writing it in order to establish the log chain.
         * Note that if the allocation of nlwb synced before we wrote
         * the block that points at it (lwb), we'd leak it if we crashed.
-        * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
+        * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
+        * We dirty the dataset to ensure that zil_sync() will be called
+        * to clean up in the event of allocation failure or I/O failure.
         */
-       txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh);
-       txg_rele_to_quiesce(&lwb->lwb_txgh);
+       tx = dmu_tx_create(zilog->zl_os);
+       VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
+       dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+       txg = dmu_tx_get_txg(tx);
+
+       lwb->lwb_tx = tx;
 
        /*
-        * Pick a ZIL blocksize. We request a size that is the
-        * maximum of the previous used size, the current used size and
-        * the amount waiting in the queue.
+        * Log blocks are pre-allocated. Here we select the size of the next
+        * block, based on size used in the last block.
+        * - first find the smallest bucket that will fit the block from a
+        *   limited set of block sizes. This is because it's faster to write
+        *   blocks allocated from the same metaslab as they are adjacent or
+        *   close.
+        * - next find the maximum from the new suggested size and an array of
+        *   previous sizes. This lessens a picket fence effect of wrongly
+        *   guessing the size if we have a stream of say 2k, 64k, 2k, 64k
+        *   requests.
+        *
+        * Note we only write what is used, but we can't just allocate
+        * the maximum block size because we can exhaust the available
+        * pool log space.
         */
-       zil_blksz = MAX(zilog->zl_prev_used,
-           zilog->zl_cur_used + sizeof (*ztp));
-       zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp));
-       zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t);
-       if (zil_blksz > ZIL_MAX_BLKSZ)
-               zil_blksz = ZIL_MAX_BLKSZ;
+       zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
+       for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
+               continue;
+       zil_blksz = zil_block_buckets[i];
+       if (zil_blksz == UINT64_MAX)
+               zil_blksz = SPA_MAXBLOCKSIZE;
+       zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
+       for (i = 0; i < ZIL_PREV_BLKS; i++)
+               zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
+       zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
 
        BP_ZERO(bp);
        /* pass the old blkptr in order to spread log blocks across devs */
-       error = zio_alloc_blk(spa, zil_blksz, bp, &lwb->lwb_blk, txg);
-       if (error) {
-               dmu_tx_t *tx = dmu_tx_create_assigned(zilog->zl_dmu_pool, txg);
+       error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
+           USE_SLOG(zilog));
+       if (!error) {
+               ASSERT3U(bp->blk_birth, ==, txg);
+               bp->blk_cksum = lwb->lwb_blk.blk_cksum;
+               bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
 
                /*
-                * We dirty the dataset to ensure that zil_sync() will
-                * be called to remove this lwb from our zl_lwb_list.
-                * Failing to do so, may leave an lwb with a NULL lwb_buf
-                * hanging around on the zl_lwb_list.
+                * Allocate a new log write buffer (lwb).
                 */
-               dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
-               dmu_tx_commit(tx);
-
-               /*
-                * Since we've just experienced an allocation failure so we
-                * terminate the current lwb and send it on its way.
-                */
-               ztp->zit_pad = 0;
-               ztp->zit_nused = lwb->lwb_nused;
-               ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
-               zio_nowait(lwb->lwb_zio);
+               nlwb = zil_alloc_lwb(zilog, bp, txg);
 
-               /*
-                * By returning NULL the caller will call tx_wait_synced()
-                */
-               return (NULL);
+               /* Record the block for later vdev flushing */
+               zil_add_block(zilog, &lwb->lwb_blk);
        }
 
-       ASSERT3U(bp->blk_birth, ==, txg);
-       ztp->zit_pad = 0;
-       ztp->zit_nused = lwb->lwb_nused;
-       ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
-       bp->blk_cksum = lwb->lwb_blk.blk_cksum;
-       bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
+       if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
+               /* For Slim ZIL only write what is used. */
+               wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
+               ASSERT3U(wsz, <=, lwb->lwb_sz);
+               zio_shrink(lwb->lwb_zio, wsz);
 
-       /*
-        * Allocate a new log write buffer (lwb).
-        */
-       nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
+       } else {
+               wsz = lwb->lwb_sz;
+       }
 
-       nlwb->lwb_zilog = zilog;
-       nlwb->lwb_blk = *bp;
-       nlwb->lwb_nused = 0;
-       nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
-       nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
-       nlwb->lwb_max_txg = txg;
-       nlwb->lwb_zio = NULL;
+       zilc->zc_pad = 0;
+       zilc->zc_nused = lwb->lwb_nused;
+       zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
 
        /*
-        * Put new lwb at the end of the log chain
+        * clear unused data for security
         */
-       mutex_enter(&zilog->zl_lock);
-       list_insert_tail(&zilog->zl_lwb_list, nlwb);
-       mutex_exit(&zilog->zl_lock);
+       bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);
 
-       /* Record the block for later vdev flushing */
-       zil_add_block(zilog, &lwb->lwb_blk);
+       zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */
 
        /*
-        * kick off the write for the old log block
+        * If there was an allocation failure then nlwb will be null which
+        * forces a txg_wait_synced().
         */
-       dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
-       ASSERT(lwb->lwb_zio);
-       zio_nowait(lwb->lwb_zio);
-
        return (nlwb);
 }
 
@@ -863,20 +947,20 @@ static lwb_t *
 zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 {
        lr_t *lrc = &itx->itx_lr; /* common log record */
-       lr_write_t *lr = (lr_write_t *)lrc;
+       lr_write_t *lrw = (lr_write_t *)lrc;
+       char *lr_buf;
        uint64_t txg = lrc->lrc_txg;
        uint64_t reclen = lrc->lrc_reclen;
-       uint64_t dlen;
+       uint64_t dlen = 0;
 
        if (lwb == NULL)
                return (NULL);
+
        ASSERT(lwb->lwb_buf != NULL);
 
        if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
                dlen = P2ROUNDUP_TYPED(
-                   lr->lr_length, sizeof (uint64_t), uint64_t);
-       else
-               dlen = 0;
+                   lrw->lr_length, sizeof (uint64_t), uint64_t);
 
        zilog->zl_cur_used += (reclen + dlen);
 
@@ -885,24 +969,22 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
        /*
         * If this record won't fit in the current log block, start a new one.
         */
-       if (lwb->lwb_nused + reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
+       if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
                lwb = zil_lwb_write_start(zilog, lwb);
                if (lwb == NULL)
                        return (NULL);
                zil_lwb_write_init(zilog, lwb);
-               ASSERT(lwb->lwb_nused == 0);
-               if (reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
+               ASSERT(LWB_EMPTY(lwb));
+               if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
                        txg_wait_synced(zilog->zl_dmu_pool, txg);
                        return (lwb);
                }
        }
 
-       /*
-        * Update the lrc_seq, to be log record sequence number. See zil.h
-        * Then copy the record to the log buffer.
-        */
-       lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
-       bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen);
+       lr_buf = lwb->lwb_buf + lwb->lwb_nused;
+       bcopy(lrc, lr_buf, reclen);
+       lrc = (lr_t *)lr_buf;
+       lrw = (lr_write_t *)lrc;
 
        /*
         * If it's a write, fetch the data or get its blkptr as appropriate.
@@ -914,18 +996,16 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
                        char *dbuf;
                        int error;
 
-                       /* alignment is guaranteed */
-                       lr = (lr_write_t *)(lwb->lwb_buf + lwb->lwb_nused);
                        if (dlen) {
                                ASSERT(itx->itx_wr_state == WR_NEED_COPY);
-                               dbuf = lwb->lwb_buf + lwb->lwb_nused + reclen;
-                               lr->lr_common.lrc_reclen += dlen;
+                               dbuf = lr_buf + reclen;
+                               lrw->lr_common.lrc_reclen += dlen;
                        } else {
                                ASSERT(itx->itx_wr_state == WR_INDIRECT);
                                dbuf = NULL;
                        }
                        error = zilog->zl_get_data(
-                           itx->itx_private, lr, dbuf, lwb->lwb_zio);
+                           itx->itx_private, lrw, dbuf, lwb->lwb_zio);
                        if (error == EIO) {
                                txg_wait_synced(zilog->zl_dmu_pool, txg);
                                return (lwb);
@@ -938,9 +1018,16 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
                }
        }
 
+       /*
+        * We're actually making an entry, so update lrc_seq to be the
+        * log record sequence number.  Note that this is generally not
+        * equal to the itx sequence number because not all transactions
+        * are synchronous, and sometimes spa_sync() gets there first.
+        */
+       lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
        lwb->lwb_nused += reclen + dlen;
        lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
-       ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb));
+       ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
        ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0);
 
        return (lwb);
@@ -962,12 +1049,19 @@ zil_itx_create(uint64_t txtype, size_t lrsize)
        return (itx);
 }
 
+void
+zil_itx_destroy(itx_t *itx)
+{
+       kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen);
+}
+
 uint64_t
 zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
 {
        uint64_t seq;
 
        ASSERT(itx->itx_lr.lrc_seq == 0);
+       ASSERT(!zilog->zl_replay);
 
        mutex_enter(&zilog->zl_lock);
        list_insert_tail(&zilog->zl_itx_list, itx);
@@ -1016,8 +1110,7 @@ zil_itx_clean(zilog_t *zilog)
        /* destroy sync'd log transactions */
        while ((itx = list_head(&clean_list)) != NULL) {
                list_remove(&clean_list, itx);
-               kmem_free(itx, offsetof(itx_t, itx_lr)
-                   + itx->itx_lr.lrc_reclen);
+               zil_itx_destroy(itx);
        }
        list_destroy(&clean_list);
 }
@@ -1036,7 +1129,7 @@ zil_clean(zilog_t *zilog)
        if ((itx != NULL) &&
            (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) {
                (void) taskq_dispatch(zilog->zl_clean_taskq,
-                   (task_func_t *)zil_itx_clean, zilog, TQ_SLEEP);
+                   (task_func_t *)zil_itx_clean, zilog, TQ_NOSLEEP);
        }
        mutex_exit(&zilog->zl_lock);
 }
@@ -1046,9 +1139,10 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
 {
        uint64_t txg;
        uint64_t commit_seq = 0;
-       itx_t *itx, *itx_next = (itx_t *)-1;
+       itx_t *itx, *itx_next;
        lwb_t *lwb;
        spa_t *spa;
+       int error = 0;
 
        zilog->zl_writer = B_TRUE;
        ASSERT(zilog->zl_root_zio == NULL);
@@ -1068,77 +1162,64 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
                                return;
                        }
                        mutex_exit(&zilog->zl_lock);
-                       zil_create(zilog);
+                       lwb = zil_create(zilog);
                        mutex_enter(&zilog->zl_lock);
-                       lwb = list_tail(&zilog->zl_lwb_list);
                }
        }
+       ASSERT(lwb == NULL || lwb->lwb_zio == NULL);
 
        /* Loop through in-memory log transactions filling log blocks. */
        DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
-       for (;;) {
+
+       for (itx = list_head(&zilog->zl_itx_list); itx; itx = itx_next) {
                /*
-                * Find the next itx to push:
-                * Push all transactions related to specified foid and all
-                * other transactions except TX_WRITE, TX_TRUNCATE,
-                * TX_SETATTR and TX_ACL for all other files.
+                * Save the next pointer.  Even though we drop zl_lock below,
+                * all threads that can remove itx list entries (other writers
+                * and zil_itx_clean()) can't do so until they have zl_writer.
                 */
-               if (itx_next != (itx_t *)-1)
-                       itx = itx_next;
-               else
-                       itx = list_head(&zilog->zl_itx_list);
-               for (; itx != NULL; itx = list_next(&zilog->zl_itx_list, itx)) {
-                       if (foid == 0) /* push all foids? */
-                               break;
-                       if (itx->itx_sync) /* push all O_[D]SYNC */
-                               break;
-                       switch (itx->itx_lr.lrc_txtype) {
-                       case TX_SETATTR:
-                       case TX_WRITE:
-                       case TX_TRUNCATE:
-                       case TX_ACL:
-                               /* lr_foid is same offset for these records */
-                               if (((lr_write_t *)&itx->itx_lr)->lr_foid
-                                   != foid) {
-                                       continue; /* skip this record */
-                               }
-                       }
-                       break;
-               }
-               if (itx == NULL)
-                       break;
+               itx_next = list_next(&zilog->zl_itx_list, itx);
+
+               /*
+                * Determine whether to push this itx.
+                * Push all transactions related to specified foid and
+                * all other transactions except those that can be logged
+                * out of order (TX_WRITE, TX_TRUNCATE, TX_SETATTR, TX_ACL)
+                * for all other files.
+                *
+                * If foid == 0 (meaning "push all foids") or
+                * itx->itx_sync is set (meaning O_[D]SYNC), push regardless.
+                */
+               if (foid != 0 && !itx->itx_sync &&
+                   TX_OOO(itx->itx_lr.lrc_txtype) &&
+                   ((lr_ooo_t *)&itx->itx_lr)->lr_foid != foid)
+                       continue; /* skip this record */
 
                if ((itx->itx_lr.lrc_seq > seq) &&
-                   ((lwb == NULL) || (lwb->lwb_nused == 0) ||
-                   (lwb->lwb_nused + itx->itx_sod > ZIL_BLK_DATA_SZ(lwb)))) {
+                   ((lwb == NULL) || (LWB_EMPTY(lwb)) ||
+                   (lwb->lwb_nused + itx->itx_sod > lwb->lwb_sz)))
                        break;
-               }
 
-               /*
-                * Save the next pointer.  Even though we soon drop
-                * zl_lock all threads that may change the list
-                * (another writer or zil_itx_clean) can't do so until
-                * they have zl_writer.
-                */
-               itx_next = list_next(&zilog->zl_itx_list, itx);
                list_remove(&zilog->zl_itx_list, itx);
                zilog->zl_itx_list_sz -= itx->itx_sod;
+
                mutex_exit(&zilog->zl_lock);
+
                txg = itx->itx_lr.lrc_txg;
                ASSERT(txg);
 
                if (txg > spa_last_synced_txg(spa) ||
                    txg > spa_freeze_txg(spa))
                        lwb = zil_lwb_commit(zilog, itx, lwb);
-               kmem_free(itx, offsetof(itx_t, itx_lr)
-                   + itx->itx_lr.lrc_reclen);
+
+               zil_itx_destroy(itx);
+
                mutex_enter(&zilog->zl_lock);
        }
        DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
        /* determine commit sequence number */
        itx = list_head(&zilog->zl_itx_list);
        if (itx)
-               commit_seq = itx->itx_lr.lrc_seq;
+               commit_seq = itx->itx_lr.lrc_seq - 1;
        else
                commit_seq = zilog->zl_itx_seq;
        mutex_exit(&zilog->zl_lock);
@@ -1155,22 +1236,28 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
         */
        if (zilog->zl_root_zio) {
                DTRACE_PROBE1(zil__cw3, zilog_t *, zilog);
-               (void) zio_wait(zilog->zl_root_zio);
+               error = zio_wait(zilog->zl_root_zio);
                zilog->zl_root_zio = NULL;
                DTRACE_PROBE1(zil__cw4, zilog_t *, zilog);
                zil_flush_vdevs(zilog);
        }
 
-       if (zilog->zl_log_error || lwb == NULL) {
-               zilog->zl_log_error = 0;
+       if (error || lwb == NULL)
                txg_wait_synced(zilog->zl_dmu_pool, 0);
-       }
 
        mutex_enter(&zilog->zl_lock);
        zilog->zl_writer = B_FALSE;
 
        ASSERT3U(commit_seq, >=, zilog->zl_commit_seq);
        zilog->zl_commit_seq = commit_seq;
+
+       /*
+        * Remember the highest committed log sequence number for ztest.
+        * We only update this value when all the log writes succeeded,
+        * because ztest wants to ASSERT that it got the whole log chain.
+        */
+       if (error == 0 && lwb != NULL)
+               zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
 }
 
 /*
@@ -1181,7 +1268,7 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
 void
 zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid)
 {
-       if (zilog == NULL || seq == 0)
+       if (zilog->zl_sync == ZFS_SYNC_DISABLED || seq == 0)
                return;
 
        mutex_enter(&zilog->zl_lock);
@@ -1190,7 +1277,7 @@ zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid)
 
        while (zilog->zl_writer) {
                cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
-               if (seq < zilog->zl_commit_seq) {
+               if (seq <= zilog->zl_commit_seq) {
                        mutex_exit(&zilog->zl_lock);
                        return;
                }
@@ -1202,6 +1289,33 @@ zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid)
 }
 
 /*
+ * Report whether all transactions are committed.
+ */
+static boolean_t
+zil_is_committed(zilog_t *zilog)
+{
+       lwb_t *lwb;
+       boolean_t committed;
+
+       mutex_enter(&zilog->zl_lock);
+
+       while (zilog->zl_writer)
+               cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
+
+       if (!list_is_empty(&zilog->zl_itx_list))
+               committed = B_FALSE;            /* unpushed transactions */
+       else if ((lwb = list_head(&zilog->zl_lwb_list)) == NULL)
+               committed = B_TRUE;             /* intent log never used */
+       else if (list_next(&zilog->zl_lwb_list, lwb) != NULL)
+               committed = B_FALSE;            /* zil_sync() not done yet */
+       else
+               committed = B_TRUE;             /* everything synced */
+
+       mutex_exit(&zilog->zl_lock);
+       return (committed);
+}
+
+/*
  * Called in syncing context to free committed log blocks and update log header.
  */
 void
@@ -1210,6 +1324,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
        zil_header_t *zh = zil_header_in_syncing_context(zilog);
        uint64_t txg = dmu_tx_get_txg(tx);
        spa_t *spa = zilog->zl_spa;
+       uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
        lwb_t *lwb;
 
        /*
@@ -1223,7 +1338,11 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 
        ASSERT(zilog->zl_stop_sync == 0);
 
-       zh->zh_replay_seq = zilog->zl_replayed_seq[txg & TXG_MASK];
+       if (*replayed_seq != 0) {
+               ASSERT(zh->zh_replay_seq < *replayed_seq);
+               zh->zh_replay_seq = *replayed_seq;
+               *replayed_seq = 0;
+       }
 
        if (zilog->zl_destroy_txg == txg) {
                blkptr_t blk = zh->zh_log;
@@ -1252,7 +1371,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
                if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
                        break;
                list_remove(&zilog->zl_lwb_list, lwb);
-               zio_free_blk(spa, &lwb->lwb_blk, txg);
+               zio_free_zil(spa, txg, &lwb->lwb_blk);
                kmem_cache_free(zil_lwb_cache, lwb);
 
                /*
@@ -1280,6 +1399,18 @@ zil_fini(void)
        kmem_cache_destroy(zil_lwb_cache);
 }
 
+void
+zil_set_sync(zilog_t *zilog, uint64_t sync)
+{
+       zilog->zl_sync = sync;
+}
+
+void
+zil_set_logbias(zilog_t *zilog, uint64_t logbias)
+{
+       zilog->zl_logbias = logbias;
+}
+
 zilog_t *
 zil_alloc(objset_t *os, zil_header_t *zh_phys)
 {
@@ -1292,6 +1423,8 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
        zilog->zl_spa = dmu_objset_spa(os);
        zilog->zl_dmu_pool = dmu_objset_pool(os);
        zilog->zl_destroy_txg = TXG_INITIAL - 1;
+       zilog->zl_logbias = dmu_objset_logbias(os);
+       zilog->zl_sync = dmu_objset_syncprop(os);
 
        mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
 
@@ -1368,7 +1501,7 @@ zil_close(zilog_t *zilog)
        if (!zil_is_committed(zilog)) {
                uint64_t txg;
                dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
-               (void) dmu_tx_assign(tx, TXG_WAIT);
+               VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
                dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
                txg = dmu_tx_get_txg(tx);
                dmu_tx_commit(tx);
@@ -1442,88 +1575,88 @@ zil_resume(zilog_t *zilog)
 }
 
 typedef struct zil_replay_arg {
-       objset_t        *zr_os;
        zil_replay_func_t **zr_replay;
        void            *zr_arg;
        boolean_t       zr_byteswap;
-       char            *zr_lrbuf;
+       char            *zr_lr;
 } zil_replay_arg_t;
 
-static void
+static int
+zil_replay_error(zilog_t *zilog, lr_t *lr, int error)
+{
+       char name[MAXNAMELEN];
+
+       zilog->zl_replaying_seq--;      /* didn't actually replay this one */
+
+       dmu_objset_name(zilog->zl_os, name);
+
+       cmn_err(CE_WARN, "ZFS replay transaction error %d, "
+           "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name,
+           (u_longlong_t)lr->lrc_seq,
+           (u_longlong_t)(lr->lrc_txtype & ~TX_CI),
+           (lr->lrc_txtype & TX_CI) ? "CI" : "");
+
+       return (error);
+}
+
+static int
 zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
 {
        zil_replay_arg_t *zr = zra;
        const zil_header_t *zh = zilog->zl_header;
        uint64_t reclen = lr->lrc_reclen;
        uint64_t txtype = lr->lrc_txtype;
-       char *name;
-       int pass, error;
-
-       if (!zilog->zl_replay)                  /* giving up */
-               return;
+       int error = 0;
 
-       if (lr->lrc_txg < claim_txg)            /* already committed */
-               return;
+       zilog->zl_replaying_seq = lr->lrc_seq;
 
        if (lr->lrc_seq <= zh->zh_replay_seq)   /* already replayed */
-               return;
+               return (0);
+
+       if (lr->lrc_txg < claim_txg)            /* already committed */
+               return (0);
 
        /* Strip case-insensitive bit, still present in log record */
        txtype &= ~TX_CI;
 
-       if (txtype == 0 || txtype >= TX_MAX_TYPE) {
-               error = EINVAL;
-               goto bad;
+       if (txtype == 0 || txtype >= TX_MAX_TYPE)
+               return (zil_replay_error(zilog, lr, EINVAL));
+
+       /*
+        * If this record type can be logged out of order, the object
+        * (lr_foid) may no longer exist.  That's legitimate, not an error.
+        */
+       if (TX_OOO(txtype)) {
+               error = dmu_object_info(zilog->zl_os,
+                   ((lr_ooo_t *)lr)->lr_foid, NULL);
+               if (error == ENOENT || error == EEXIST)
+                       return (0);
        }
 
        /*
         * Make a copy of the data so we can revise and extend it.
         */
-       bcopy(lr, zr->zr_lrbuf, reclen);
+       bcopy(lr, zr->zr_lr, reclen);
+
+       /*
+        * If this is a TX_WRITE with a blkptr, suck in the data.
+        */
+       if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
+               error = zil_read_log_data(zilog, (lr_write_t *)lr,
+                   zr->zr_lr + reclen);
+               if (error)
+                       return (zil_replay_error(zilog, lr, error));
+       }
 
        /*
         * The log block containing this lr may have been byteswapped
         * so that we can easily examine common fields like lrc_txtype.
-        * However, the log is a mix of different data types, and only the
+        * However, the log is a mix of different record types, and only the
         * replay vectors know how to byteswap their records.  Therefore, if
         * the lr was byteswapped, undo it before invoking the replay vector.
         */
        if (zr->zr_byteswap)
-               byteswap_uint64_array(zr->zr_lrbuf, reclen);
-
-       /*
-        * If this is a TX_WRITE with a blkptr, suck in the data.
-        */
-       if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
-               lr_write_t *lrw = (lr_write_t *)lr;
-               blkptr_t *wbp = &lrw->lr_blkptr;
-               uint64_t wlen = lrw->lr_length;
-               char *wbuf = zr->zr_lrbuf + reclen;
-
-               if (BP_IS_HOLE(wbp)) {  /* compressed to a hole */
-                       bzero(wbuf, wlen);
-               } else {
-                       /*
-                        * A subsequent write may have overwritten this block,
-                        * in which case wbp may have been been freed and
-                        * reallocated, and our read of wbp may fail with a
-                        * checksum error.  We can safely ignore this because
-                        * the later write will provide the correct data.
-                        */
-                       zbookmark_t zb;
-
-                       zb.zb_objset = dmu_objset_id(zilog->zl_os);
-                       zb.zb_object = lrw->lr_foid;
-                       zb.zb_level = -1;
-                       zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp);
-
-                       (void) zio_wait(zio_read(NULL, zilog->zl_spa,
-                           wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL,
-                           ZIO_PRIORITY_SYNC_READ,
-                           ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
-                       (void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen);
-               }
-       }
+               byteswap_uint64_array(zr->zr_lr, reclen);
 
        /*
         * We must now do two things atomically: replay this log record,
@@ -1531,42 +1664,30 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
         * we did so. At the end of each replay function the sequence number
         * is updated if we are in replay mode.
         */
-       for (pass = 1; pass <= 2; pass++) {
-               zilog->zl_replaying_seq = lr->lrc_seq;
-               /* Only byteswap (if needed) on the 1st pass.  */
-               error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
-                   zr->zr_byteswap && pass == 1);
-
-               if (!error)
-                       return;
-
+       error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
+       if (error) {
                /*
                 * The DMU's dnode layer doesn't see removes until the txg
                 * commits, so a subsequent claim can spuriously fail with
                 * EEXIST. So if we receive any error we try syncing out
-                * any removes then retry the transaction.
+                * any removes then retry the transaction.  Note that we
+                * specify B_FALSE for byteswap now, so we don't do it twice.
                 */
-               if (pass == 1)
-                       txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
+               txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
+               error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
+               if (error)
+                       return (zil_replay_error(zilog, lr, error));
        }
-
-bad:
-       ASSERT(error);
-       name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
-       dmu_objset_name(zr->zr_os, name);
-       cmn_err(CE_WARN, "ZFS replay transaction error %d, "
-           "dataset %s, seq 0x%llx, txtype %llu %s\n",
-           error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype,
-           (lr->lrc_txtype & TX_CI) ? "CI" : "");
-       zilog->zl_replay = B_FALSE;
-       kmem_free(name, MAXNAMELEN);
+       return (0);
 }
 
 /* ARGSUSED */
-static void
+static int
 zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
        zilog->zl_replay_blks++;
+
+       return (0);
 }
 
 /*
@@ -1584,11 +1705,10 @@ zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
                return;
        }
 
-       zr.zr_os = os;
        zr.zr_replay = replay_func;
        zr.zr_arg = arg;
        zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
-       zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
+       zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
 
        /*
         * Wait for in-progress removes to sync before starting replay.
@@ -1596,69 +1716,42 @@ zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
        txg_wait_synced(zilog->zl_dmu_pool, 0);
 
        zilog->zl_replay = B_TRUE;
-       zilog->zl_replay_time = lbolt;
+       zilog->zl_replay_time = ddi_get_lbolt();
        ASSERT(zilog->zl_replay_blks == 0);
        (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
            zh->zh_claim_txg);
-       kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE);
+       kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
 
        zil_destroy(zilog, B_FALSE);
        txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
        zilog->zl_replay = B_FALSE;
 }
 
-/*
- * Report whether all transactions are committed
- */
-int
-zil_is_committed(zilog_t *zilog)
+boolean_t
+zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
 {
-       lwb_t *lwb;
-       int ret;
-
-       mutex_enter(&zilog->zl_lock);
-       while (zilog->zl_writer)
-               cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
-
-       /* recent unpushed intent log transactions? */
-       if (!list_is_empty(&zilog->zl_itx_list)) {
-               ret = B_FALSE;
-               goto out;
-       }
-
-       /* intent log never used? */
-       lwb = list_head(&zilog->zl_lwb_list);
-       if (lwb == NULL) {
-               ret = B_TRUE;
-               goto out;
-       }
+       if (zilog->zl_sync == ZFS_SYNC_DISABLED)
+               return (B_TRUE);
 
-       /*
-        * more than 1 log buffer means zil_sync() hasn't yet freed
-        * entries after a txg has committed
-        */
-       if (list_next(&zilog->zl_lwb_list, lwb)) {
-               ret = B_FALSE;
-               goto out;
+       if (zilog->zl_replay) {
+               dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+               zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
+                   zilog->zl_replaying_seq;
+               return (B_TRUE);
        }
 
-       ASSERT(zil_empty(zilog));
-       ret = B_TRUE;
-out:
-       cv_broadcast(&zilog->zl_cv_writer);
-       mutex_exit(&zilog->zl_lock);
-       return (ret);
+       return (B_FALSE);
 }
 
 /* ARGSUSED */
 int
-zil_vdev_offline(char *osname, void *arg)
+zil_vdev_offline(const char *osname, void *arg)
 {
        objset_t *os;
        zilog_t *zilog;
        int error;
 
-       error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
+       error = dmu_objset_hold(osname, FTAG, &os);
        if (error)
                return (error);
 
@@ -1667,6 +1760,6 @@ zil_vdev_offline(char *osname, void *arg)
                error = EEXIST;
        else
                zil_resume(zilog);
-       dmu_objset_close(os);
+       dmu_objset_rele(os, FTAG);
        return (error);
 }
index a2bdab9..88d80af 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -32,6 +31,9 @@
 #include <sys/zio_impl.h>
 #include <sys/zio_compress.h>
 #include <sys/zio_checksum.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/ddt.h>
 
 /*
  * ==========================================================================
@@ -42,13 +44,15 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
        0,      /* ZIO_PRIORITY_NOW             */
        0,      /* ZIO_PRIORITY_SYNC_READ       */
        0,      /* ZIO_PRIORITY_SYNC_WRITE      */
-       6,      /* ZIO_PRIORITY_ASYNC_READ      */
-       4,      /* ZIO_PRIORITY_ASYNC_WRITE     */
-       4,      /* ZIO_PRIORITY_FREE            */
-       0,      /* ZIO_PRIORITY_CACHE_FILL      */
        0,      /* ZIO_PRIORITY_LOG_WRITE       */
+       1,      /* ZIO_PRIORITY_CACHE_FILL      */
+       1,      /* ZIO_PRIORITY_AGG             */
+       4,      /* ZIO_PRIORITY_FREE            */
+       4,      /* ZIO_PRIORITY_ASYNC_WRITE     */
+       6,      /* ZIO_PRIORITY_ASYNC_READ      */
        10,     /* ZIO_PRIORITY_RESILVER        */
        20,     /* ZIO_PRIORITY_SCRUB           */
+       2,      /* ZIO_PRIORITY_DDT_PREFETCH    */
 };
 
 /*
@@ -57,11 +61,9 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
  * ==========================================================================
  */
 char *zio_type_name[ZIO_TYPES] = {
-       "null", "read", "write", "free", "claim", "ioctl" };
-
-#define        SYNC_PASS_DEFERRED_FREE 1       /* defer frees after this pass */
-#define        SYNC_PASS_DONT_COMPRESS 4       /* don't compress after this pass */
-#define        SYNC_PASS_REWRITE       1       /* rewrite new bps after this pass */
+       "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
+       "zio_ioctl"
+};
 
 /*
  * ==========================================================================
@@ -81,8 +83,15 @@ extern vmem_t *zio_alloc_arena;
  * An allocating zio is one that either currently has the DVA allocate
  * stage set or will have it later in its lifetime.
  */
-#define        IO_IS_ALLOCATING(zio) \
-       ((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))
+#define        IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
+
+boolean_t      zio_requeue_io_start_cut_in_line = B_TRUE;
+
+#ifdef ZFS_DEBUG
+int zio_buf_debug_limit = 16384;
+#else
+int zio_buf_debug_limit = 0;
+#endif
 
 void
 zio_init(void)
@@ -124,12 +133,13 @@ zio_init(void)
                        char name[36];
                        (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
                        zio_buf_cache[c] = kmem_cache_create(name, size,
-                           align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
+                           align, NULL, NULL, NULL, NULL, NULL,
+                           size > zio_buf_debug_limit ? KMC_NODEBUG : 0);
 
                        (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
                        zio_data_buf_cache[c] = kmem_cache_create(name, size,
                            align, NULL, NULL, NULL, NULL, data_alloc_arena,
-                           KMC_NODEBUG);
+                           size > zio_buf_debug_limit ? KMC_NODEBUG : 0);
                }
        }
 
@@ -264,7 +274,8 @@ zio_pop_transforms(zio_t *zio)
                        zt->zt_transform(zio,
                            zt->zt_orig_data, zt->zt_orig_size);
 
-               zio_buf_free(zio->io_data, zt->zt_bufsize);
+               if (zt->zt_bufsize != 0)
+                       zio_buf_free(zio->io_data, zt->zt_bufsize);
 
                zio->io_data = zt->zt_orig_data;
                zio->io_size = zt->zt_orig_size;
@@ -293,7 +304,7 @@ zio_decompress(zio_t *zio, void *data, uint64_t size)
 {
        if (zio->io_error == 0 &&
            zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
-           zio->io_data, zio->io_size, data, size) != 0)
+           zio->io_data, data, zio->io_size, size) != 0)
                zio->io_error = EIO;
 }
 
@@ -378,6 +389,9 @@ zio_add_child(zio_t *pio, zio_t *cio)
        list_insert_head(&pio->io_child_list, zl);
        list_insert_head(&cio->io_parent_list, zl);
 
+       pio->io_child_count++;
+       cio->io_parent_count++;
+
        mutex_exit(&pio->io_lock);
        mutex_exit(&cio->io_lock);
 }
@@ -394,6 +408,9 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
        list_remove(&pio->io_child_list, zl);
        list_remove(&cio->io_parent_list, zl);
 
+       pio->io_child_count--;
+       cio->io_parent_count--;
+
        mutex_exit(&pio->io_lock);
        mutex_exit(&cio->io_lock);
 
@@ -409,7 +426,7 @@ zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
        mutex_enter(&zio->io_lock);
        ASSERT(zio->io_stall == NULL);
        if (*countp != 0) {
-               zio->io_stage--;
+               zio->io_stage >>= 1;
                zio->io_stall = countp;
                waiting = B_TRUE;
        }
@@ -451,10 +468,11 @@ zio_inherit_child_errors(zio_t *zio, enum zio_child c)
  * ==========================================================================
  */
 static zio_t *
-zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     void *data, uint64_t size, zio_done_func_t *done, void *private,
-    zio_type_t type, int priority, int flags, vdev_t *vd, uint64_t offset,
-    const zbookmark_t *zb, uint8_t stage, uint32_t pipeline)
+    zio_type_t type, int priority, enum zio_flag flags,
+    vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
+    enum zio_stage stage, enum zio_stage pipeline)
 {
        zio_t *zio;
 
@@ -481,14 +499,17 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
                zio->io_child_type = ZIO_CHILD_VDEV;
        else if (flags & ZIO_FLAG_GANG_CHILD)
                zio->io_child_type = ZIO_CHILD_GANG;
+       else if (flags & ZIO_FLAG_DDT_CHILD)
+               zio->io_child_type = ZIO_CHILD_DDT;
        else
                zio->io_child_type = ZIO_CHILD_LOGICAL;
 
        if (bp != NULL) {
-               zio->io_bp = bp;
+               zio->io_bp = (blkptr_t *)bp;
                zio->io_bp_copy = *bp;
                zio->io_bp_orig = *bp;
-               if (type != ZIO_TYPE_WRITE)
+               if (type != ZIO_TYPE_WRITE ||
+                   zio->io_child_type == ZIO_CHILD_DDT)
                        zio->io_bp = &zio->io_bp_copy;  /* so caller can free */
                if (zio->io_child_type == ZIO_CHILD_LOGICAL)
                        zio->io_logical = zio;
@@ -498,14 +519,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 
        zio->io_spa = spa;
        zio->io_txg = txg;
-       zio->io_data = data;
-       zio->io_size = size;
        zio->io_done = done;
        zio->io_private = private;
        zio->io_type = type;
        zio->io_priority = priority;
        zio->io_vd = vd;
        zio->io_offset = offset;
+       zio->io_orig_data = zio->io_data = data;
+       zio->io_orig_size = zio->io_size = size;
        zio->io_orig_flags = zio->io_flags = flags;
        zio->io_orig_stage = zio->io_stage = stage;
        zio->io_orig_pipeline = zio->io_pipeline = pipeline;
@@ -539,7 +560,7 @@ zio_destroy(zio_t *zio)
 
 zio_t *
 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
-    void *private, int flags)
+    void *private, enum zio_flag flags)
 {
        zio_t *zio;
 
@@ -551,7 +572,7 @@ zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
 }
 
 zio_t *
-zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
+zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
 {
        return (zio_null(NULL, spa, NULL, done, private, flags));
 }
@@ -559,33 +580,24 @@ zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
 zio_t *
 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     void *data, uint64_t size, zio_done_func_t *done, void *private,
-    int priority, int flags, const zbookmark_t *zb)
+    int priority, enum zio_flag flags, const zbookmark_t *zb)
 {
        zio_t *zio;
 
-       zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp,
+       zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
            data, size, done, private,
            ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
-           ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
+           ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
+           ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
 
        return (zio);
 }
 
-void
-zio_skip_write(zio_t *zio)
-{
-       ASSERT(zio->io_type == ZIO_TYPE_WRITE);
-       ASSERT(zio->io_stage == ZIO_STAGE_READY);
-       ASSERT(!BP_IS_GANG(zio->io_bp));
-
-       zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
-}
-
 zio_t *
 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    void *data, uint64_t size, zio_prop_t *zp,
+    void *data, uint64_t size, const zio_prop_t *zp,
     zio_done_func_t *ready, zio_done_func_t *done, void *private,
-    int priority, int flags, const zbookmark_t *zb)
+    int priority, enum zio_flag flags, const zbookmark_t *zb)
 {
        zio_t *zio;
 
@@ -595,13 +607,15 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
            zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
            zp->zp_type < DMU_OT_NUMTYPES &&
            zp->zp_level < 32 &&
-           zp->zp_ndvas > 0 &&
-           zp->zp_ndvas <= spa_max_replication(spa));
-       ASSERT(ready != NULL);
+           zp->zp_copies > 0 &&
+           zp->zp_copies <= spa_max_replication(spa) &&
+           zp->zp_dedup <= 1 &&
+           zp->zp_dedup_verify <= 1);
 
        zio = zio_create(pio, spa, txg, bp, data, size, done, private,
            ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
-           ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
+           ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
+           ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
 
        zio->io_ready = ready;
        zio->io_prop = *zp;
@@ -612,7 +626,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 zio_t *
 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
     uint64_t size, zio_done_func_t *done, void *private, int priority,
-    int flags, zbookmark_t *zb)
+    enum zio_flag flags, zbookmark_t *zb)
 {
        zio_t *zio;
 
@@ -623,33 +637,47 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
        return (zio);
 }
 
+void
+zio_write_override(zio_t *zio, blkptr_t *bp, int copies)
+{
+       ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+       ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+       ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
+       ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
+
+       zio->io_prop.zp_copies = copies;
+       zio->io_bp_override = bp;
+}
+
+void
+zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
+{
+       bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
+}
+
 zio_t *
-zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    zio_done_func_t *done, void *private, int flags)
+zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
+    enum zio_flag flags)
 {
        zio_t *zio;
 
-       ASSERT(!BP_IS_HOLE(bp));
+       dprintf_bp(bp, "freeing in txg %llu, pass %u",
+           (longlong_t)txg, spa->spa_sync_pass);
 
-       if (bp->blk_fill == BLK_FILL_ALREADY_FREED)
-               return (zio_null(pio, spa, NULL, NULL, NULL, flags));
-
-       if (txg == spa->spa_syncing_txg &&
-           spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) {
-               bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
-               return (zio_null(pio, spa, NULL, NULL, NULL, flags));
-       }
+       ASSERT(!BP_IS_HOLE(bp));
+       ASSERT(spa_syncing_txg(spa) == txg);
+       ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE);
 
        zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
-           done, private, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
+           NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
            NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
 
        return (zio);
 }
 
 zio_t *
-zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    zio_done_func_t *done, void *private, int flags)
+zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
+    zio_done_func_t *done, void *private, enum zio_flag flags)
 {
        zio_t *zio;
 
@@ -663,9 +691,11 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
         *
         * All claims *must* be resolved in the first txg -- before the SPA
         * starts allocating blocks -- so that nothing is allocated twice.
+        * If txg == 0 we just verify that the block is claimable.
         */
        ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
-       ASSERT3U(spa_first_txg(spa), <=, txg);
+       ASSERT(txg == spa_first_txg(spa) || txg == 0);
+       ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));       /* zdb(1M) */
 
        zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
            done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
@@ -676,7 +706,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 
 zio_t *
 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
-    zio_done_func_t *done, void *private, int priority, int flags)
+    zio_done_func_t *done, void *private, int priority, enum zio_flag flags)
 {
        zio_t *zio;
        int c;
@@ -701,7 +731,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
 zio_t *
 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     void *data, int checksum, zio_done_func_t *done, void *private,
-    int priority, int flags, boolean_t labels)
+    int priority, enum zio_flag flags, boolean_t labels)
 {
        zio_t *zio;
 
@@ -722,7 +752,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 zio_t *
 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     void *data, int checksum, zio_done_func_t *done, void *private,
-    int priority, int flags, boolean_t labels)
+    int priority, enum zio_flag flags, boolean_t labels)
 {
        zio_t *zio;
 
@@ -737,9 +767,9 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 
        zio->io_prop.zp_checksum = checksum;
 
-       if (zio_checksum_table[checksum].ci_zbt) {
+       if (zio_checksum_table[checksum].ci_eck) {
                /*
-                * zbt checksums are necessarily destructive -- they modify
+                * zec checksums are necessarily destructive -- they modify
                 * the end of the write buffer to hold the verifier/checksum.
                 * Therefore, we must make a local copy in case the data is
                 * being written to multiple places in parallel.
@@ -757,10 +787,10 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
  */
 zio_t *
 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
-       void *data, uint64_t size, int type, int priority, int flags,
+       void *data, uint64_t size, int type, int priority, enum zio_flag flags,
        zio_done_func_t *done, void *private)
 {
-       uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
+       enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
        zio_t *zio;
 
        ASSERT(vd->vdev_parent ==
@@ -773,26 +803,33 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
                 * detection as close to the leaves as possible and
                 * eliminates redundant checksums in the interior nodes.
                 */
-               pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
-               pio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
+               pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
+               pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
        }
 
        if (vd->vdev_children == 0)
                offset += VDEV_LABEL_START_SIZE;
 
+       flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
+
+       /*
+        * If we've decided to do a repair, the write is not speculative --
+        * even if the original read was.
+        */
+       if (flags & ZIO_FLAG_IO_REPAIR)
+               flags &= ~ZIO_FLAG_SPECULATIVE;
+
        zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
-           done, private, type, priority,
-           (pio->io_flags & ZIO_FLAG_VDEV_INHERIT) |
-           ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | flags,
-           vd, offset, &pio->io_bookmark,
-           ZIO_STAGE_VDEV_IO_START - 1, pipeline);
+           done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
+           ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
 
        return (zio);
 }
 
 zio_t *
 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
-       int type, int priority, int flags, zio_done_func_t *done, void *private)
+       int type, int priority, enum zio_flag flags,
+       zio_done_func_t *done, void *private)
 {
        zio_t *zio;
 
@@ -802,7 +839,7 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
            data, size, done, private, type, priority,
            flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
            vd, offset, NULL,
-           ZIO_STAGE_VDEV_IO_START - 1, ZIO_VDEV_CHILD_PIPELINE);
+           ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
 
        return (zio);
 }
@@ -815,6 +852,23 @@ zio_flush(zio_t *zio, vdev_t *vd)
            ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
 }
 
+void
+zio_shrink(zio_t *zio, uint64_t size)
+{
+       ASSERT(zio->io_executor == NULL);
+       ASSERT(zio->io_orig_size == zio->io_size);
+       ASSERT(size <= zio->io_size);
+
+       /*
+        * We don't shrink for raidz because of problems with the
+        * reconstruction when reading back less than the block size.
+        * Note, BP_IS_RAIDZ() assumes no compression.
+        */
+       ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
+       if (!BP_IS_RAIDZ(zio->io_bp))
+               zio->io_orig_size = zio->io_size = size;
+}
+
 /*
  * ==========================================================================
  * Prepare to read and write logical blocks
@@ -829,28 +883,33 @@ zio_read_bp_init(zio_t *zio)
        if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
            zio->io_child_type == ZIO_CHILD_LOGICAL &&
            !(zio->io_flags & ZIO_FLAG_RAW)) {
-               uint64_t csize = BP_GET_PSIZE(bp);
-               void *cbuf = zio_buf_alloc(csize);
+               uint64_t psize = BP_GET_PSIZE(bp);
+               void *cbuf = zio_buf_alloc(psize);
 
-               zio_push_transform(zio, cbuf, csize, csize, zio_decompress);
+               zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
        }
 
        if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
                zio->io_flags |= ZIO_FLAG_DONT_CACHE;
 
+       if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
+               zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+
+       if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
+               zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
+
        return (ZIO_PIPELINE_CONTINUE);
 }
 
 static int
 zio_write_bp_init(zio_t *zio)
 {
+       spa_t *spa = zio->io_spa;
        zio_prop_t *zp = &zio->io_prop;
-       int compress = zp->zp_compress;
+       enum zio_compress compress = zp->zp_compress;
        blkptr_t *bp = zio->io_bp;
-       void *cbuf;
        uint64_t lsize = zio->io_size;
-       uint64_t csize = lsize;
-       uint64_t cbufsize = 0;
+       uint64_t psize = lsize;
        int pass = 1;
 
        /*
@@ -864,7 +923,29 @@ zio_write_bp_init(zio_t *zio)
        if (!IO_IS_ALLOCATING(zio))
                return (ZIO_PIPELINE_CONTINUE);
 
-       ASSERT(compress != ZIO_COMPRESS_INHERIT);
+       ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
+
+       if (zio->io_bp_override) {
+               ASSERT(bp->blk_birth != zio->io_txg);
+               ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
+
+               *bp = *zio->io_bp_override;
+               zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+               if (BP_IS_HOLE(bp) || !zp->zp_dedup)
+                       return (ZIO_PIPELINE_CONTINUE);
+
+               ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
+                   zp->zp_dedup_verify);
+
+               if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
+                       BP_SET_DEDUP(bp, 1);
+                       zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
+                       return (ZIO_PIPELINE_CONTINUE);
+               }
+               zio->io_bp_override = NULL;
+               BP_ZERO(bp);
+       }
 
        if (bp->blk_birth == zio->io_txg) {
                /*
@@ -876,22 +957,29 @@ zio_write_bp_init(zio_t *zio)
                 * convergence take longer.  Therefore, after the first
                 * few passes, stop compressing to ensure convergence.
                 */
-               pass = spa_sync_pass(zio->io_spa);
+               pass = spa_sync_pass(spa);
+
+               ASSERT(zio->io_txg == spa_syncing_txg(spa));
+               ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+               ASSERT(!BP_GET_DEDUP(bp));
 
                if (pass > SYNC_PASS_DONT_COMPRESS)
                        compress = ZIO_COMPRESS_OFF;
 
                /* Make sure someone doesn't change their mind on overwrites */
-               ASSERT(MIN(zp->zp_ndvas + BP_IS_GANG(bp),
-                   spa_max_replication(zio->io_spa)) == BP_GET_NDVAS(bp));
+               ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
+                   spa_max_replication(spa)) == BP_GET_NDVAS(bp));
        }
 
        if (compress != ZIO_COMPRESS_OFF) {
-               if (!zio_compress_data(compress, zio->io_data, zio->io_size,
-                   &cbuf, &csize, &cbufsize)) {
+               void *cbuf = zio_buf_alloc(lsize);
+               psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
+               if (psize == 0 || psize == lsize) {
                        compress = ZIO_COMPRESS_OFF;
-               } else if (csize != 0) {
-                       zio_push_transform(zio, cbuf, csize, cbufsize, NULL);
+                       zio_buf_free(cbuf, lsize);
+               } else {
+                       ASSERT(psize < lsize);
+                       zio_push_transform(zio, cbuf, psize, lsize, NULL);
                }
        }
 
@@ -903,10 +991,10 @@ zio_write_bp_init(zio_t *zio)
         * spa_sync() to allocate new blocks, but force rewrites after that.
         * There should only be a handful of blocks after pass 1 in any case.
         */
-       if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
+       if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
            pass > SYNC_PASS_REWRITE) {
-               ASSERT(csize != 0);
-               uint32_t gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
+               ASSERT(psize != 0);
+               enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
                zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
                zio->io_flags |= ZIO_FLAG_IO_REWRITE;
        } else {
@@ -914,17 +1002,36 @@ zio_write_bp_init(zio_t *zio)
                zio->io_pipeline = ZIO_WRITE_PIPELINE;
        }
 
-       if (csize == 0) {
+       if (psize == 0) {
                zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
        } else {
                ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
                BP_SET_LSIZE(bp, lsize);
-               BP_SET_PSIZE(bp, csize);
+               BP_SET_PSIZE(bp, psize);
                BP_SET_COMPRESS(bp, compress);
                BP_SET_CHECKSUM(bp, zp->zp_checksum);
                BP_SET_TYPE(bp, zp->zp_type);
                BP_SET_LEVEL(bp, zp->zp_level);
+               BP_SET_DEDUP(bp, zp->zp_dedup);
                BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+               if (zp->zp_dedup) {
+                       ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+                       ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+                       zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
+               }
+       }
+
+       return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+zio_free_bp_init(zio_t *zio)
+{
+       blkptr_t *bp = zio->io_bp;
+
+       if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
+               if (BP_GET_DEDUP(bp))
+                       zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
        }
 
        return (ZIO_PIPELINE_CONTINUE);
@@ -937,9 +1044,11 @@ zio_write_bp_init(zio_t *zio)
  */
 
 static void
-zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q)
+zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
 {
+       spa_t *spa = zio->io_spa;
        zio_type_t t = zio->io_type;
+       int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0);
 
        /*
         * If we're a config writer or a probe, the normal issue and
@@ -955,8 +1064,16 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q)
        if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
                t = ZIO_TYPE_NULL;
 
-       (void) taskq_dispatch(zio->io_spa->spa_zio_taskq[t][q],
-           (task_func_t *)zio_execute, zio, TQ_SLEEP);
+       /*
+        * If this is a high priority I/O, then use the high priority taskq.
+        */
+       if (zio->io_priority == ZIO_PRIORITY_NOW &&
+           spa->spa_zio_taskq[t][q + 1] != NULL)
+               q++;
+
+       ASSERT3U(q, <, ZIO_TASKQ_TYPES);
+       (void) taskq_dispatch(spa->spa_zio_taskq[t][q],
+           (task_func_t *)zio_execute, zio, flags);
 }
 
 static boolean_t
@@ -975,7 +1092,7 @@ zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
 static int
 zio_issue_async(zio_t *zio)
 {
-       zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
+       zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
 
        return (ZIO_PIPELINE_STOP);
 }
@@ -983,7 +1100,7 @@ zio_issue_async(zio_t *zio)
 void
 zio_interrupt(zio_t *zio)
 {
-       zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT);
+       zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
 }
 
 /*
@@ -999,7 +1116,7 @@ zio_interrupt(zio_t *zio)
  * There's no locking on io_stage because there's no legitimate way
  * for multiple threads to be attempting to process the same I/O.
  */
-static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES];
+static zio_pipe_stage_t *zio_pipeline[];
 
 void
 zio_execute(zio_t *zio)
@@ -1007,32 +1124,39 @@ zio_execute(zio_t *zio)
        zio->io_executor = curthread;
 
        while (zio->io_stage < ZIO_STAGE_DONE) {
-               uint32_t pipeline = zio->io_pipeline;
-               zio_stage_t stage = zio->io_stage;
+               enum zio_stage pipeline = zio->io_pipeline;
+               enum zio_stage stage = zio->io_stage;
                int rv;
 
                ASSERT(!MUTEX_HELD(&zio->io_lock));
+               ASSERT(ISP2(stage));
+               ASSERT(zio->io_stall == NULL);
 
-               while (((1U << ++stage) & pipeline) == 0)
-                       continue;
+               do {
+                       stage <<= 1;
+               } while ((stage & pipeline) == 0);
 
                ASSERT(stage <= ZIO_STAGE_DONE);
-               ASSERT(zio->io_stall == NULL);
 
                /*
                 * If we are in interrupt context and this pipeline stage
                 * will grab a config lock that is held across I/O,
-                * issue async to avoid deadlock.
+                * or may wait for an I/O that needs an interrupt thread
+                * to complete, issue async to avoid deadlock.
+                *
+                * For VDEV_IO_START, we cut in line so that the io will
+                * be sent to disk promptly.
                 */
-               if (((1U << stage) & ZIO_CONFIG_LOCK_BLOCKING_STAGES) &&
-                   zio->io_vd == NULL &&
+               if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
                    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
-                       zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
+                       boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
+                           zio_requeue_io_start_cut_in_line : B_FALSE;
+                       zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
                        return;
                }
 
                zio->io_stage = stage;
-               rv = zio_pipeline[stage](zio);
+               rv = zio_pipeline[highbit(stage) - 1](zio);
 
                if (rv == ZIO_PIPELINE_STOP)
                        return;
@@ -1115,19 +1239,8 @@ zio_reexecute(zio_t *pio)
        for (int c = 0; c < ZIO_CHILD_TYPES; c++)
                pio->io_child_error[c] = 0;
 
-       if (IO_IS_ALLOCATING(pio)) {
-               /*
-                * Remember the failed bp so that the io_ready() callback
-                * can update its accounting upon reexecution.  The block
-                * was already freed in zio_done(); we indicate this with
-                * a fill count of -1 so that zio_free() knows to skip it.
-                */
-               blkptr_t *bp = pio->io_bp;
-               ASSERT(bp->blk_birth == 0 || bp->blk_birth == pio->io_txg);
-               bp->blk_fill = BLK_FILL_ALREADY_FREED;
-               pio->io_bp_orig = *bp;
-               BP_ZERO(bp);
-       }
+       if (IO_IS_ALLOCATING(pio))
+               BP_ZERO(pio->io_bp);
 
        /*
         * As we reexecute pio's children, new children could be created.
@@ -1315,6 +1428,12 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
                        zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
                            data, BP_GET_PSIZE(bp));
                }
+               /*
+                * If we are here to damage data for testing purposes,
+                * leave the GBH alone so that we can detect the damage.
+                */
+               if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
+                       zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
        } else {
                zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
                    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
@@ -1328,8 +1447,8 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
 zio_t *
 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
 {
-       return (zio_free(pio, pio->io_spa, pio->io_txg, bp,
-           NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
+       return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
+           ZIO_GANG_CHILD_FLAGS(pio)));
 }
 
 /* ARGSUSED */
@@ -1413,7 +1532,7 @@ zio_gang_tree_assemble_done(zio_t *zio)
        blkptr_t *bp = zio->io_bp;
 
        ASSERT(gio == zio_unique_parent(zio));
-       ASSERT(zio_walk_children(zio) == NULL);
+       ASSERT(zio->io_child_count == 0);
 
        if (zio->io_error)
                return;
@@ -1423,7 +1542,7 @@ zio_gang_tree_assemble_done(zio_t *zio)
 
        ASSERT(zio->io_data == gn->gn_gbh);
        ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
-       ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
+       ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
 
        for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
                blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
@@ -1450,7 +1569,7 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
        zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
 
        if (gn != NULL) {
-               ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
+               ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
 
                for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
                        blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
@@ -1519,9 +1638,9 @@ zio_write_gang_member_ready(zio_t *zio)
        ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
 
        ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
-       ASSERT3U(zio->io_prop.zp_ndvas, ==, gio->io_prop.zp_ndvas);
-       ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
-       ASSERT3U(pio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
+       ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
+       ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
+       ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
        ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
 
        mutex_enter(&pio->io_lock);
@@ -1546,13 +1665,13 @@ zio_write_gang_block(zio_t *pio)
        uint64_t txg = pio->io_txg;
        uint64_t resid = pio->io_size;
        uint64_t lsize;
-       int ndvas = gio->io_prop.zp_ndvas;
-       int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
+       int copies = gio->io_prop.zp_copies;
+       int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
        zio_prop_t zp;
        int error;
 
-       error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE,
-           bp, gbh_ndvas, txg, pio == gio ? NULL : gio->io_bp,
+       error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
+           bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
            METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
        if (error) {
                pio->io_error = error;
@@ -1588,7 +1707,9 @@ zio_write_gang_block(zio_t *pio)
                zp.zp_compress = ZIO_COMPRESS_OFF;
                zp.zp_type = DMU_OT_NONE;
                zp.zp_level = 0;
-               zp.zp_ndvas = gio->io_prop.zp_ndvas;
+               zp.zp_copies = gio->io_prop.zp_copies;
+               zp.zp_dedup = 0;
+               zp.zp_dedup_verify = 0;
 
                zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
                    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
@@ -1609,15 +1730,383 @@ zio_write_gang_block(zio_t *pio)
 
 /*
  * ==========================================================================
- * Allocate and free blocks
+ * Dedup
  * ==========================================================================
  */
+static void
+zio_ddt_child_read_done(zio_t *zio)
+{
+       blkptr_t *bp = zio->io_bp;
+       ddt_entry_t *dde = zio->io_private;
+       ddt_phys_t *ddp;
+       zio_t *pio = zio_unique_parent(zio);
+
+       mutex_enter(&pio->io_lock);
+       ddp = ddt_phys_select(dde, bp);
+       if (zio->io_error == 0)
+               ddt_phys_clear(ddp);    /* this ddp doesn't need repair */
+       if (zio->io_error == 0 && dde->dde_repair_data == NULL)
+               dde->dde_repair_data = zio->io_data;
+       else
+               zio_buf_free(zio->io_data, zio->io_size);
+       mutex_exit(&pio->io_lock);
+}
+
+static int
+zio_ddt_read_start(zio_t *zio)
+{
+       blkptr_t *bp = zio->io_bp;
+
+       ASSERT(BP_GET_DEDUP(bp));
+       ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+       ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+       if (zio->io_child_error[ZIO_CHILD_DDT]) {
+               ddt_t *ddt = ddt_select(zio->io_spa, bp);
+               ddt_entry_t *dde = ddt_repair_start(ddt, bp);
+               ddt_phys_t *ddp = dde->dde_phys;
+               ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
+               blkptr_t blk;
+
+               ASSERT(zio->io_vsd == NULL);
+               zio->io_vsd = dde;
+
+               if (ddp_self == NULL)
+                       return (ZIO_PIPELINE_CONTINUE);
+
+               for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+                       if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
+                               continue;
+                       ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
+                           &blk);
+                       zio_nowait(zio_read(zio, zio->io_spa, &blk,
+                           zio_buf_alloc(zio->io_size), zio->io_size,
+                           zio_ddt_child_read_done, dde, zio->io_priority,
+                           ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
+                           &zio->io_bookmark));
+               }
+               return (ZIO_PIPELINE_CONTINUE);
+       }
+
+       zio_nowait(zio_read(zio, zio->io_spa, bp,
+           zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
+           ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
+
+       return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+zio_ddt_read_done(zio_t *zio)
+{
+       blkptr_t *bp = zio->io_bp;
+
+       if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
+               return (ZIO_PIPELINE_STOP);
+
+       ASSERT(BP_GET_DEDUP(bp));
+       ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+       ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+       if (zio->io_child_error[ZIO_CHILD_DDT]) {
+               ddt_t *ddt = ddt_select(zio->io_spa, bp);
+               ddt_entry_t *dde = zio->io_vsd;
+               if (ddt == NULL) {
+                       ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
+                       return (ZIO_PIPELINE_CONTINUE);
+               }
+               if (dde == NULL) {
+                       zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
+                       zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
+                       return (ZIO_PIPELINE_STOP);
+               }
+               if (dde->dde_repair_data != NULL) {
+                       bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
+                       zio->io_child_error[ZIO_CHILD_DDT] = 0;
+               }
+               ddt_repair_done(ddt, dde);
+               zio->io_vsd = NULL;
+       }
+
+       ASSERT(zio->io_vsd == NULL);
+
+       return (ZIO_PIPELINE_CONTINUE);
+}
+
+static boolean_t
+zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
+{
+       spa_t *spa = zio->io_spa;
+
+       /*
+        * Note: we compare the original data, not the transformed data,
+        * because when zio->io_bp is an override bp, we will not have
+        * pushed the I/O transforms.  That's an important optimization
+        * because otherwise we'd compress/encrypt all dmu_sync() data twice.
+        */
+       for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+               zio_t *lio = dde->dde_lead_zio[p];
+
+               if (lio != NULL) {
+                       return (lio->io_orig_size != zio->io_orig_size ||
+                           bcmp(zio->io_orig_data, lio->io_orig_data,
+                           zio->io_orig_size) != 0);
+               }
+       }
+
+       for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+               ddt_phys_t *ddp = &dde->dde_phys[p];
+
+               if (ddp->ddp_phys_birth != 0) {
+                       arc_buf_t *abuf = NULL;
+                       uint32_t aflags = ARC_WAIT;
+                       blkptr_t blk = *zio->io_bp;
+                       int error;
+
+                       ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
+
+                       ddt_exit(ddt);
+
+                       error = arc_read_nolock(NULL, spa, &blk,
+                           arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
+                           ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+                           &aflags, &zio->io_bookmark);
+
+                       if (error == 0) {
+                               if (arc_buf_size(abuf) != zio->io_orig_size ||
+                                   bcmp(abuf->b_data, zio->io_orig_data,
+                                   zio->io_orig_size) != 0)
+                                       error = EEXIST;
+                               VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+                       }
+
+                       ddt_enter(ddt);
+                       return (error != 0);
+               }
+       }
+
+       return (B_FALSE);
+}
+
+static void
+zio_ddt_child_write_ready(zio_t *zio)
+{
+       int p = zio->io_prop.zp_copies;
+       ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+       ddt_entry_t *dde = zio->io_private;
+       ddt_phys_t *ddp = &dde->dde_phys[p];
+       zio_t *pio;
+
+       if (zio->io_error)
+               return;
+
+       ddt_enter(ddt);
+
+       ASSERT(dde->dde_lead_zio[p] == zio);
+
+       ddt_phys_fill(ddp, zio->io_bp);
+
+       while ((pio = zio_walk_parents(zio)) != NULL)
+               ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
+
+       ddt_exit(ddt);
+}
+
+static void
+zio_ddt_child_write_done(zio_t *zio)
+{
+       int p = zio->io_prop.zp_copies;
+       ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+       ddt_entry_t *dde = zio->io_private;
+       ddt_phys_t *ddp = &dde->dde_phys[p];
+
+       ddt_enter(ddt);
+
+       ASSERT(ddp->ddp_refcnt == 0);
+       ASSERT(dde->dde_lead_zio[p] == zio);
+       dde->dde_lead_zio[p] = NULL;
+
+       if (zio->io_error == 0) {
+               while (zio_walk_parents(zio) != NULL)
+                       ddt_phys_addref(ddp);
+       } else {
+               ddt_phys_clear(ddp);
+       }
+
+       ddt_exit(ddt);
+}
+
+static void
+zio_ddt_ditto_write_done(zio_t *zio)
+{
+       int p = DDT_PHYS_DITTO;
+       zio_prop_t *zp = &zio->io_prop;
+       blkptr_t *bp = zio->io_bp;
+       ddt_t *ddt = ddt_select(zio->io_spa, bp);
+       ddt_entry_t *dde = zio->io_private;
+       ddt_phys_t *ddp = &dde->dde_phys[p];
+       ddt_key_t *ddk = &dde->dde_key;
+
+       ddt_enter(ddt);
+
+       ASSERT(ddp->ddp_refcnt == 0);
+       ASSERT(dde->dde_lead_zio[p] == zio);
+       dde->dde_lead_zio[p] = NULL;
+
+       if (zio->io_error == 0) {
+               ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
+               ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
+               ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
+               if (ddp->ddp_phys_birth != 0)
+                       ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
+               ddt_phys_fill(ddp, bp);
+       }
+
+       ddt_exit(ddt);
+}
+
+static int
+zio_ddt_write(zio_t *zio)
+{
+       spa_t *spa = zio->io_spa;
+       blkptr_t *bp = zio->io_bp;
+       uint64_t txg = zio->io_txg;
+       zio_prop_t *zp = &zio->io_prop;
+       int p = zp->zp_copies;
+       int ditto_copies;
+       zio_t *cio = NULL;
+       zio_t *dio = NULL;
+       ddt_t *ddt = ddt_select(spa, bp);
+       ddt_entry_t *dde;
+       ddt_phys_t *ddp;
+
+       ASSERT(BP_GET_DEDUP(bp));
+       ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
+       ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
+
+       ddt_enter(ddt);
+       dde = ddt_lookup(ddt, bp, B_TRUE);
+       ddp = &dde->dde_phys[p];
+
+       if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
+               /*
+                * If we're using a weak checksum, upgrade to a strong checksum
+                * and try again.  If we're already using a strong checksum,
+                * we can't resolve it, so just convert to an ordinary write.
+                * (And automatically e-mail a paper to Nature?)
+                */
+               if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
+                       zp->zp_checksum = spa_dedup_checksum(spa);
+                       zio_pop_transforms(zio);
+                       zio->io_stage = ZIO_STAGE_OPEN;
+                       BP_ZERO(bp);
+               } else {
+                       zp->zp_dedup = 0;
+               }
+               zio->io_pipeline = ZIO_WRITE_PIPELINE;
+               ddt_exit(ddt);
+               return (ZIO_PIPELINE_CONTINUE);
+       }
+
+       ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
+       ASSERT(ditto_copies < SPA_DVAS_PER_BP);
+
+       if (ditto_copies > ddt_ditto_copies_present(dde) &&
+           dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
+               zio_prop_t czp = *zp;
+
+               czp.zp_copies = ditto_copies;
+
+               /*
+                * If we arrived here with an override bp, we won't have run
+                * the transform stack, so we won't have the data we need to
+                * generate a child i/o.  So, toss the override bp and restart.
+                * This is safe, because using the override bp is just an
+                * optimization; and it's rare, so the cost doesn't matter.
+                */
+               if (zio->io_bp_override) {
+                       zio_pop_transforms(zio);
+                       zio->io_stage = ZIO_STAGE_OPEN;
+                       zio->io_pipeline = ZIO_WRITE_PIPELINE;
+                       zio->io_bp_override = NULL;
+                       BP_ZERO(bp);
+                       ddt_exit(ddt);
+                       return (ZIO_PIPELINE_CONTINUE);
+               }
+
+               dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+                   zio->io_orig_size, &czp, NULL,
+                   zio_ddt_ditto_write_done, dde, zio->io_priority,
+                   ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+
+               zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
+               dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
+       }
+
+       if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
+               if (ddp->ddp_phys_birth != 0)
+                       ddt_bp_fill(ddp, bp, txg);
+               if (dde->dde_lead_zio[p] != NULL)
+                       zio_add_child(zio, dde->dde_lead_zio[p]);
+               else
+                       ddt_phys_addref(ddp);
+       } else if (zio->io_bp_override) {
+               ASSERT(bp->blk_birth == txg);
+               ASSERT(BP_EQUAL(bp, zio->io_bp_override));
+               ddt_phys_fill(ddp, bp);
+               ddt_phys_addref(ddp);
+       } else {
+               cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+                   zio->io_orig_size, zp, zio_ddt_child_write_ready,
+                   zio_ddt_child_write_done, dde, zio->io_priority,
+                   ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+
+               zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
+               dde->dde_lead_zio[p] = cio;
+       }
+
+       ddt_exit(ddt);
+
+       if (cio)
+               zio_nowait(cio);
+       if (dio)
+               zio_nowait(dio);
+
+       return (ZIO_PIPELINE_CONTINUE);
+}
+
+ddt_entry_t *freedde; /* for debugging */
 
 static int
+zio_ddt_free(zio_t *zio)
+{
+       spa_t *spa = zio->io_spa;
+       blkptr_t *bp = zio->io_bp;
+       ddt_t *ddt = ddt_select(spa, bp);
+       ddt_entry_t *dde;
+       ddt_phys_t *ddp;
+
+       ASSERT(BP_GET_DEDUP(bp));
+       ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+       ddt_enter(ddt);
+       freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
+       ddp = ddt_phys_select(dde, bp);
+       ddt_phys_decref(ddp);
+       ddt_exit(ddt);
+
+       return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free blocks
+ * ==========================================================================
+ */
+static int
 zio_dva_allocate(zio_t *zio)
 {
        spa_t *spa = zio->io_spa;
-       metaslab_class_t *mc = spa->spa_normal_class;
+       metaslab_class_t *mc = spa_normal_class(spa);
        blkptr_t *bp = zio->io_bp;
        int error;
 
@@ -1628,12 +2117,12 @@ zio_dva_allocate(zio_t *zio)
 
        ASSERT(BP_IS_HOLE(bp));
        ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
-       ASSERT3U(zio->io_prop.zp_ndvas, >, 0);
-       ASSERT3U(zio->io_prop.zp_ndvas, <=, spa_max_replication(spa));
+       ASSERT3U(zio->io_prop.zp_copies, >, 0);
+       ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
        ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
 
        error = metaslab_alloc(spa, mc, zio->io_size, bp,
-           zio->io_prop.zp_ndvas, zio->io_txg, NULL, 0);
+           zio->io_prop.zp_copies, zio->io_txg, NULL, 0);
 
        if (error) {
                if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
@@ -1672,36 +2161,11 @@ zio_dva_claim(zio_t *zio)
 static void
 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
 {
-       spa_t *spa = zio->io_spa;
-       boolean_t now = !(zio->io_flags & ZIO_FLAG_IO_REWRITE);
-
        ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
-
-       if (zio->io_bp == bp && !now) {
-               /*
-                * This is a rewrite for sync-to-convergence.
-                * We can't do a metaslab_free(NOW) because bp wasn't allocated
-                * during this sync pass, which means that metaslab_sync()
-                * already committed the allocation.
-                */
-               ASSERT(DVA_EQUAL(BP_IDENTITY(bp),
-                   BP_IDENTITY(&zio->io_bp_orig)));
-               ASSERT(spa_sync_pass(spa) > 1);
-
-               if (BP_IS_GANG(bp) && gn == NULL) {
-                       /*
-                        * This is a gang leader whose gang header(s) we
-                        * couldn't read now, so defer the free until later.
-                        * The block should still be intact because without
-                        * the headers, we'd never even start the rewrite.
-                        */
-                       bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
-                       return;
-               }
-       }
+       ASSERT(zio->io_bp_override == NULL);
 
        if (!BP_IS_HOLE(bp))
-               metaslab_free(spa, bp, bp->blk_birth, now);
+               metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
 
        if (gn != NULL) {
                for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
@@ -1715,25 +2179,31 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
  */
 int
-zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
-    uint64_t txg)
+zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
+    uint64_t size, boolean_t use_slog)
 {
-       int error;
+       int error = 1;
 
-       error = metaslab_alloc(spa, spa->spa_log_class, size,
-           new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
+       ASSERT(txg > spa_syncing_txg(spa));
+
+       if (use_slog)
+               error = metaslab_alloc(spa, spa_log_class(spa), size,
+                   new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
 
        if (error)
-               error = metaslab_alloc(spa, spa->spa_normal_class, size,
+               error = metaslab_alloc(spa, spa_normal_class(spa), size,
                    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
 
        if (error == 0) {
                BP_SET_LSIZE(new_bp, size);
                BP_SET_PSIZE(new_bp, size);
                BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
-               BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
+               BP_SET_CHECKSUM(new_bp,
+                   spa_version(spa) >= SPA_VERSION_SLIM_ZIL
+                   ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
                BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
                BP_SET_LEVEL(new_bp, 0);
+               BP_SET_DEDUP(new_bp, 0);
                BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
        }
 
@@ -1741,15 +2211,15 @@ zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
 }
 
 /*
- * Free an intent log block.  We know it can't be a gang block, so there's
- * nothing to do except metaslab_free() it.
+ * Free an intent log block.
  */
 void
-zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
+zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
 {
+       ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
        ASSERT(!BP_IS_GANG(bp));
 
-       metaslab_free(spa, bp, txg, B_FALSE);
+       zio_free(spa, txg, bp);
 }
 
 /*
@@ -1878,6 +2348,32 @@ zio_vdev_io_done(zio_t *zio)
        return (ZIO_PIPELINE_CONTINUE);
 }
 
+/*
+ * For non-raidz ZIOs, we can just copy aside the bad data read from the
+ * disk, and use that to finish the checksum ereport later.
+ */
+static void
+zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
+    const void *good_buf)
+{
+       /* no processing needed */
+       zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
+}
+
+/*ARGSUSED*/
+void
+zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
+{
+       void *buf = zio_buf_alloc(zio->io_size);
+
+       bcopy(zio->io_data, buf, zio->io_size);
+
+       zcr->zcr_cbinfo = zio->io_size;
+       zcr->zcr_cbdata = buf;
+       zcr->zcr_finish = zio_vsd_default_cksum_finish;
+       zcr->zcr_free = zio_buf_free;
+}
+
 static int
 zio_vdev_io_assess(zio_t *zio)
 {
@@ -1890,7 +2386,7 @@ zio_vdev_io_assess(zio_t *zio)
                spa_config_exit(zio->io_spa, SCL_ZIO, zio);
 
        if (zio->io_vsd != NULL) {
-               zio->io_vsd_free(zio);
+               zio->io_vsd_ops->vsd_free(zio);
                zio->io_vsd = NULL;
        }
 
@@ -1899,6 +2395,9 @@ zio_vdev_io_assess(zio_t *zio)
 
        /*
         * If the I/O failed, determine whether we should attempt to retry it.
+        *
+        * On retry, we cut in line in the issue queue, since we don't want
+        * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
         */
        if (zio->io_error && vd == NULL &&
            !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
@@ -1907,8 +2406,9 @@ zio_vdev_io_assess(zio_t *zio)
                zio->io_error = 0;
                zio->io_flags |= ZIO_FLAG_IO_RETRY |
                    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
-               zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
-               zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
+               zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
+               zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
+                   zio_requeue_io_start_cut_in_line);
                return (ZIO_PIPELINE_STOP);
        }
 
@@ -1940,7 +2440,7 @@ zio_vdev_io_reissue(zio_t *zio)
        ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
        ASSERT(zio->io_error == 0);
 
-       zio->io_stage--;
+       zio->io_stage >>= 1;
 }
 
 void
@@ -1948,7 +2448,7 @@ zio_vdev_io_redone(zio_t *zio)
 {
        ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
 
-       zio->io_stage--;
+       zio->io_stage >>= 1;
 }
 
 void
@@ -1958,7 +2458,7 @@ zio_vdev_io_bypass(zio_t *zio)
        ASSERT(zio->io_error == 0);
 
        zio->io_flags |= ZIO_FLAG_IO_BYPASS;
-       zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
+       zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
 }
 
 /*
@@ -2000,9 +2500,12 @@ zio_checksum_generate(zio_t *zio)
 static int
 zio_checksum_verify(zio_t *zio)
 {
+       zio_bad_cksum_t info;
        blkptr_t *bp = zio->io_bp;
        int error;
 
+       ASSERT(zio->io_vd != NULL);
+
        if (bp == NULL) {
                /*
                 * This is zio_read_phys().
@@ -2014,11 +2517,12 @@ zio_checksum_verify(zio_t *zio)
                ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
        }
 
-       if ((error = zio_checksum_error(zio)) != 0) {
+       if ((error = zio_checksum_error(zio, &info)) != 0) {
                zio->io_error = error;
                if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
-                       zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
-                           zio->io_spa, zio->io_vd, zio, 0, 0);
+                       zfs_ereport_start_checksum(zio->io_spa,
+                           zio->io_vd, zio, zio->io_offset,
+                           zio->io_size, NULL, &info);
                }
        }
 
@@ -2031,7 +2535,7 @@ zio_checksum_verify(zio_t *zio)
 void
 zio_checksum_verified(zio_t *zio)
 {
-       zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
+       zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
 }
 
 /*
@@ -2071,7 +2575,8 @@ zio_ready(zio_t *zio)
        blkptr_t *bp = zio->io_bp;
        zio_t *pio, *pio_next;
 
-       if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY))
+       if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
+           zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
                return (ZIO_PIPELINE_STOP);
 
        if (zio->io_ready) {
@@ -2105,6 +2610,19 @@ zio_ready(zio_t *zio)
                zio_notify_parent(pio, zio, ZIO_WAIT_READY);
        }
 
+       if (zio->io_flags & ZIO_FLAG_NODATA) {
+               if (BP_IS_GANG(bp)) {
+                       zio->io_flags &= ~ZIO_FLAG_NODATA;
+               } else {
+                       ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
+                       zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
+               }
+       }
+
+       if (zio_injection_enabled &&
+           zio->io_spa->spa_syncing_txg == zio->io_txg)
+               zio_handle_ignored_writes(zio);
+
        return (ZIO_PIPELINE_CONTINUE);
 }
 
@@ -2124,6 +2642,7 @@ zio_done(zio_t *zio)
         */
        if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
            zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
+           zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
            zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
                return (ZIO_PIPELINE_STOP);
 
@@ -2134,23 +2653,51 @@ zio_done(zio_t *zio)
        if (bp != NULL) {
                ASSERT(bp->blk_pad[0] == 0);
                ASSERT(bp->blk_pad[1] == 0);
-               ASSERT(bp->blk_pad[2] == 0);
                ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
                    (bp == zio_unique_parent(zio)->io_bp));
                if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
+                   zio->io_bp_override == NULL &&
                    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
                        ASSERT(!BP_SHOULD_BYTESWAP(bp));
-                       ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(bp));
+                       ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
                        ASSERT(BP_COUNT_GANG(bp) == 0 ||
                            (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
                }
        }
 
        /*
-        * If there were child vdev or gang errors, they apply to us now.
+        * If there were child vdev/gang/ddt errors, they apply to us now.
         */
        zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
        zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
+       zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
+
+       /*
+        * If the I/O on the transformed data was successful, generate any
+        * checksum reports now while we still have the transformed data.
+        */
+       if (zio->io_error == 0) {
+               while (zio->io_cksum_report != NULL) {
+                       zio_cksum_report_t *zcr = zio->io_cksum_report;
+                       uint64_t align = zcr->zcr_align;
+                       uint64_t asize = P2ROUNDUP(psize, align);
+                       char *abuf = zio->io_data;
+
+                       if (asize != psize) {
+                               abuf = zio_buf_alloc(asize);
+                               bcopy(zio->io_data, abuf, psize);
+                               bzero(abuf + psize, asize - psize);
+                       }
+
+                       zio->io_cksum_report = zcr->zcr_next;
+                       zcr->zcr_next = NULL;
+                       zcr->zcr_finish(zcr, abuf);
+                       zfs_ereport_free_checksum(zcr);
+
+                       if (asize != psize)
+                               zio_buf_free(abuf, asize);
+               }
+       }
 
        zio_pop_transforms(zio);        /* note: may set zio->io_error */
 
@@ -2166,8 +2713,9 @@ zio_done(zio_t *zio)
                if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
                        zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
 
-               if ((zio->io_error == EIO ||
-                   !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && zio == lio) {
+               if ((zio->io_error == EIO || !(zio->io_flags &
+                   (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
+                   zio == lio) {
                        /*
                         * For logical I/O requests, tell the SPA to log the
                         * error and generate a logical data ereport.
@@ -2184,22 +2732,33 @@ zio_done(zio_t *zio)
                 * propagate all the way to the root via zio_notify_parent().
                 */
                ASSERT(vd == NULL && bp != NULL);
+               ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
-               if (IO_IS_ALLOCATING(zio))
+               if (IO_IS_ALLOCATING(zio) &&
+                   !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
                        if (zio->io_error != ENOSPC)
                                zio->io_reexecute |= ZIO_REEXECUTE_NOW;
                        else
                                zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+               }
 
                if ((zio->io_type == ZIO_TYPE_READ ||
                    zio->io_type == ZIO_TYPE_FREE) &&
                    zio->io_error == ENXIO &&
-                   spa->spa_load_state == SPA_LOAD_NONE &&
+                   spa_load_state(spa) == SPA_LOAD_NONE &&
                    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
                        zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 
                if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
                        zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+
+               /*
+                * Here is a possibly good place to attempt to do
+                * either combinatorial reconstruction or error correction
+                * based on checksums.  It also might be a good place
+                * to send out preliminary ereports before we suspend
+                * processing.
+                */
        }
 
        /*
@@ -2210,11 +2769,10 @@ zio_done(zio_t *zio)
         */
        zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
 
-       if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) &&
-           zio->io_child_type == ZIO_CHILD_LOGICAL) {
-               ASSERT(zio->io_child_type != ZIO_CHILD_GANG);
+       if ((zio->io_error || zio->io_reexecute) &&
+           IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
+           !(zio->io_flags & ZIO_FLAG_IO_REWRITE))
                zio_dva_unallocate(zio, zio->io_gang_tree, bp);
-       }
 
        zio_gang_tree_free(&zio->io_gang_tree);
 
@@ -2292,11 +2850,22 @@ zio_done(zio_t *zio)
                return (ZIO_PIPELINE_STOP);
        }
 
-       ASSERT(zio_walk_children(zio) == NULL);
+       ASSERT(zio->io_child_count == 0);
        ASSERT(zio->io_reexecute == 0);
        ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
 
        /*
+        * Report any checksum errors, since the I/O is complete.
+        */
+       while (zio->io_cksum_report != NULL) {
+               zio_cksum_report_t *zcr = zio->io_cksum_report;
+               zio->io_cksum_report = zcr->zcr_next;
+               zcr->zcr_next = NULL;
+               zcr->zcr_finish(zcr, NULL);
+               zfs_ereport_free_checksum(zcr);
+       }
+
+       /*
         * It is the responsibility of the done callback to ensure that this
         * particular zio is no longer discoverable for adoption, and as
         * such, cannot acquire any new parents.
@@ -2332,12 +2901,17 @@ zio_done(zio_t *zio)
  * I/O pipeline definition
  * ==========================================================================
  */
-static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES] = {
+static zio_pipe_stage_t *zio_pipeline[] = {
        NULL,
-       zio_issue_async,
        zio_read_bp_init,
+       zio_free_bp_init,
+       zio_issue_async,
        zio_write_bp_init,
        zio_checksum_generate,
+       zio_ddt_read_start,
+       zio_ddt_read_done,
+       zio_ddt_write,
+       zio_ddt_free,
        zio_gang_assemble,
        zio_gang_issue,
        zio_dva_allocate,
index bf7fe73..c8fe20f 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
+#include <sys/zil.h>
+#include <zfs_fletcher.h>
 
 /*
  * Checksum vectors.
  *     we want the ability to take advantage of that hardware.
  *
  * Of course, we don't want a checksum upgrade to invalidate existing
- * data, so we store the checksum *function* in five bits of the DVA.
- * This gives us room for up to 32 different checksum functions.
+ * data, so we store the checksum *function* in eight bits of the bp.
+ * This gives us room for up to 256 different checksum functions.
  *
  * When writing a block, we always checksum it with the latest-and-greatest
  * checksum function of the appropriate strength.  When reading a block,
  * we compare the expected checksum against the actual checksum, which we
- * compute via the checksum function specified in the DVA encoding.
+ * compute via the checksum function specified by BP_GET_CHECKSUM(bp).
  */
 
 /*ARGSUSED*/
@@ -66,19 +67,20 @@ zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
 }
 
 zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
-       {{NULL,                 NULL},                  0, 0,   "inherit"},
-       {{NULL,                 NULL},                  0, 0,   "on"},
-       {{zio_checksum_off,     zio_checksum_off},      0, 0,   "off"},
-       {{zio_checksum_SHA256,  zio_checksum_SHA256},   1, 1,   "label"},
-       {{zio_checksum_SHA256,  zio_checksum_SHA256},   1, 1,   "gang_header"},
-       {{fletcher_2_native,    fletcher_2_byteswap},   0, 1,   "zilog"},
-       {{fletcher_2_native,    fletcher_2_byteswap},   0, 0,   "fletcher2"},
-       {{fletcher_4_native,    fletcher_4_byteswap},   1, 0,   "fletcher4"},
-       {{zio_checksum_SHA256,  zio_checksum_SHA256},   1, 0,   "SHA256"},
+       {{NULL,                 NULL},                  0, 0, 0, "inherit"},
+       {{NULL,                 NULL},                  0, 0, 0, "on"},
+       {{zio_checksum_off,     zio_checksum_off},      0, 0, 0, "off"},
+       {{zio_checksum_SHA256,  zio_checksum_SHA256},   1, 1, 0, "label"},
+       {{zio_checksum_SHA256,  zio_checksum_SHA256},   1, 1, 0, "gang_header"},
+       {{fletcher_2_native,    fletcher_2_byteswap},   0, 1, 0, "zilog"},
+       {{fletcher_2_native,    fletcher_2_byteswap},   0, 0, 0, "fletcher2"},
+       {{fletcher_4_native,    fletcher_4_byteswap},   1, 0, 0, "fletcher4"},
+       {{zio_checksum_SHA256,  zio_checksum_SHA256},   1, 0, 1, "sha256"},
+       {{fletcher_4_native,    fletcher_4_byteswap},   0, 1, 0, "zilog2"},
 };
 
-uint8_t
-zio_checksum_select(uint8_t child, uint8_t parent)
+enum zio_checksum
+zio_checksum_select(enum zio_checksum child, enum zio_checksum parent)
 {
        ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
        ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
@@ -93,6 +95,29 @@ zio_checksum_select(uint8_t child, uint8_t parent)
        return (child);
 }
 
+enum zio_checksum
+zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child,
+    enum zio_checksum parent)
+{
+       ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
+       ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
+       ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
+
+       if (child == ZIO_CHECKSUM_INHERIT)
+               return (parent);
+
+       if (child == ZIO_CHECKSUM_ON)
+               return (spa_dedup_checksum(spa));
+
+       if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY))
+               return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY);
+
+       ASSERT(zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_dedup ||
+           (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF);
+
+       return (child);
+}
+
 /*
  * Set the external verifier for a gang block based on <vdev, offset, txg>,
  * a tuple which is guaranteed to be unique for the life of the pool.
@@ -101,7 +126,7 @@ static void
 zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp)
 {
        dva_t *dva = BP_IDENTITY(bp);
-       uint64_t txg = bp->blk_birth;
+       uint64_t txg = BP_PHYSICAL_BIRTH(bp);
 
        ASSERT(BP_IS_GANG(bp));
 
@@ -128,47 +153,79 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
 {
        blkptr_t *bp = zio->io_bp;
        uint64_t offset = zio->io_offset;
-       zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
        zio_checksum_info_t *ci = &zio_checksum_table[checksum];
-       zio_cksum_t zbt_cksum;
+       zio_cksum_t cksum;
 
        ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS);
        ASSERT(ci->ci_func[0] != NULL);
 
-       if (ci->ci_zbt) {
+       if (ci->ci_eck) {
+               zio_eck_t *eck;
+
+               if (checksum == ZIO_CHECKSUM_ZILOG2) {
+                       zil_chain_t *zilc = data;
+
+                       size = P2ROUNDUP_TYPED(zilc->zc_nused, ZIL_MIN_BLKSZ,
+                           uint64_t);
+                       eck = &zilc->zc_eck;
+               } else {
+                       eck = (zio_eck_t *)((char *)data + size) - 1;
+               }
                if (checksum == ZIO_CHECKSUM_GANG_HEADER)
-                       zio_checksum_gang_verifier(&zbt->zbt_cksum, bp);
+                       zio_checksum_gang_verifier(&eck->zec_cksum, bp);
                else if (checksum == ZIO_CHECKSUM_LABEL)
-                       zio_checksum_label_verifier(&zbt->zbt_cksum, offset);
+                       zio_checksum_label_verifier(&eck->zec_cksum, offset);
                else
-                       bp->blk_cksum = zbt->zbt_cksum;
-               zbt->zbt_magic = ZBT_MAGIC;
-               ci->ci_func[0](data, size, &zbt_cksum);
-               zbt->zbt_cksum = zbt_cksum;
+                       bp->blk_cksum = eck->zec_cksum;
+               eck->zec_magic = ZEC_MAGIC;
+               ci->ci_func[0](data, size, &cksum);
+               eck->zec_cksum = cksum;
        } else {
                ci->ci_func[0](data, size, &bp->blk_cksum);
        }
 }
 
 int
-zio_checksum_error(zio_t *zio)
+zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
 {
        blkptr_t *bp = zio->io_bp;
        uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
            (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
        int byteswap;
-       void *data = zio->io_data;
+       int error;
        uint64_t size = (bp == NULL ? zio->io_size :
            (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
        uint64_t offset = zio->io_offset;
-       zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
+       void *data = zio->io_data;
        zio_checksum_info_t *ci = &zio_checksum_table[checksum];
        zio_cksum_t actual_cksum, expected_cksum, verifier;
 
        if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
                return (EINVAL);
 
-       if (ci->ci_zbt) {
+       if (ci->ci_eck) {
+               zio_eck_t *eck;
+
+               if (checksum == ZIO_CHECKSUM_ZILOG2) {
+                       zil_chain_t *zilc = data;
+                       uint64_t nused;
+
+                       eck = &zilc->zc_eck;
+                       if (eck->zec_magic == ZEC_MAGIC)
+                               nused = zilc->zc_nused;
+                       else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC))
+                               nused = BSWAP_64(zilc->zc_nused);
+                       else
+                               return (ECKSUM);
+
+                       if (nused > size)
+                               return (ECKSUM);
+
+                       size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
+               } else {
+                       eck = (zio_eck_t *)((char *)data + size) - 1;
+               }
+
                if (checksum == ZIO_CHECKSUM_GANG_HEADER)
                        zio_checksum_gang_verifier(&verifier, bp);
                else if (checksum == ZIO_CHECKSUM_LABEL)
@@ -176,15 +233,15 @@ zio_checksum_error(zio_t *zio)
                else
                        verifier = bp->blk_cksum;
 
-               byteswap = (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC));
+               byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
 
                if (byteswap)
                        byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
 
-               expected_cksum = zbt->zbt_cksum;
-               zbt->zbt_cksum = verifier;
+               expected_cksum = eck->zec_cksum;
+               eck->zec_cksum = verifier;
                ci->ci_func[byteswap](data, size, &actual_cksum);
-               zbt->zbt_cksum = expected_cksum;
+               eck->zec_cksum = expected_cksum;
 
                if (byteswap)
                        byteswap_uint64_array(&expected_cksum,
@@ -196,11 +253,22 @@ zio_checksum_error(zio_t *zio)
                ci->ci_func[byteswap](data, size, &actual_cksum);
        }
 
+       info->zbc_expected = expected_cksum;
+       info->zbc_actual = actual_cksum;
+       info->zbc_checksum_name = ci->ci_name;
+       info->zbc_byteswapped = byteswap;
+       info->zbc_injected = 0;
+       info->zbc_has_cksum = 1;
+
        if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
                return (ECKSUM);
 
-       if (zio_injection_enabled && !zio->io_error)
-               return (zio_handle_fault_injection(zio, ECKSUM));
+       if (zio_injection_enabled && !zio->io_error &&
+           (error = zio_handle_fault_injection(zio, ECKSUM)) != 0) {
+
+               info->zbc_injected = 1;
+               return (error);
+       }
 
        return (0);
 }
index c563be4..f148977 100644 (file)
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/zfs_context.h>
 #include <sys/compress.h>
 #include <sys/spa.h>
@@ -51,10 +49,11 @@ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
        {gzip_compress,         gzip_decompress,        7,      "gzip-7"},
        {gzip_compress,         gzip_decompress,        8,      "gzip-8"},
        {gzip_compress,         gzip_decompress,        9,      "gzip-9"},
+       {zle_compress,          zle_decompress,         64,     "zle"},
 };
 
-uint8_t
-zio_compress_select(uint8_t child, uint8_t parent)
+enum zio_compress
+zio_compress_select(enum zio_compress child, enum zio_compress parent)
 {
        ASSERT(child < ZIO_COMPRESS_FUNCTIONS);
        ASSERT(parent < ZIO_COMPRESS_FUNCTIONS);
@@ -69,80 +68,65 @@ zio_compress_select(uint8_t child, uint8_t parent)
        return (child);
 }
 
-int
-zio_compress_data(int cpfunc, void *src, uint64_t srcsize, void **destp,
-    uint64_t *destsizep, uint64_t *destbufsizep)
+size_t
+zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
 {
        uint64_t *word, *word_end;
-       uint64_t ciosize, gapsize, destbufsize;
-       zio_compress_info_t *ci = &zio_compress_table[cpfunc];
-       char *dest;
-       uint_t allzero;
+       size_t c_len, d_len, r_len;
+       zio_compress_info_t *ci = &zio_compress_table[c];
 
-       ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
-       ASSERT((uint_t)cpfunc == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
+       ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS);
+       ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
 
        /*
         * If the data is all zeroes, we don't even need to allocate
-        * a block for it.  We indicate this by setting *destsizep = 0.
+        * a block for it.  We indicate this by returning zero size.
         */
-       allzero = 1;
-       word = src;
-       word_end = (uint64_t *)(uintptr_t)((uintptr_t)word + srcsize);
-       while (word < word_end) {
-               if (*word++ != 0) {
-                       allzero = 0;
+       word_end = (uint64_t *)((char *)src + s_len);
+       for (word = src; word < word_end; word++)
+               if (*word != 0)
                        break;
-               }
-       }
-       if (allzero) {
-               *destp = NULL;
-               *destsizep = 0;
-               *destbufsizep = 0;
-               return (1);
-       }
 
-       if (cpfunc == ZIO_COMPRESS_EMPTY)
+       if (word == word_end)
                return (0);
 
+       if (c == ZIO_COMPRESS_EMPTY)
+               return (s_len);
+
        /* Compress at least 12.5% */
-       destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE);
-       if (destbufsize == 0)
-               return (0);
-       dest = zio_buf_alloc(destbufsize);
+       d_len = P2ALIGN(s_len - (s_len >> 3), (size_t)SPA_MINBLOCKSIZE);
+       if (d_len == 0)
+               return (s_len);
 
-       ciosize = ci->ci_compress(src, dest, (size_t)srcsize,
-           (size_t)destbufsize, ci->ci_level);
-       if (ciosize > destbufsize) {
-               zio_buf_free(dest, destbufsize);
-               return (0);
-       }
+       c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level);
 
-       /* Cool.  We compressed at least as much as we were hoping to. */
+       if (c_len > d_len)
+               return (s_len);
 
-       /* For security, make sure we don't write random heap crap to disk */
-       gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize;
-       if (gapsize != 0) {
-               bzero(dest + ciosize, gapsize);
-               ciosize += gapsize;
+       /*
+        * Cool.  We compressed at least as much as we were hoping to.
+        * For both security and repeatability, pad out the last sector.
+        */
+       r_len = P2ROUNDUP(c_len, (size_t)SPA_MINBLOCKSIZE);
+       if (r_len > c_len) {
+               bzero((char *)dst + c_len, r_len - c_len);
+               c_len = r_len;
        }
 
-       ASSERT3U(ciosize, <=, destbufsize);
-       ASSERT(P2PHASE(ciosize, SPA_MINBLOCKSIZE) == 0);
-       *destp = dest;
-       *destsizep = ciosize;
-       *destbufsizep = destbufsize;
+       ASSERT3U(c_len, <=, d_len);
+       ASSERT(P2PHASE(c_len, (size_t)SPA_MINBLOCKSIZE) == 0);
 
-       return (1);
+       return (c_len);
 }
 
 int
-zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
-       void *dest, uint64_t destsize)
+zio_decompress_data(enum zio_compress c, void *src, void *dst,
+    size_t s_len, size_t d_len)
 {
-       zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+       zio_compress_info_t *ci = &zio_compress_table[c];
 
-       ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
+       if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL)
+               return (EINVAL);
 
-       return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
+       return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level));
 }
index f8e6880..16eaed6 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
@@ -43,8 +42,8 @@
 #include <sys/arc.h>
 #include <sys/zio_impl.h>
 #include <sys/zfs_ioctl.h>
-#include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
+#include <sys/dmu_objset.h>
 #include <sys/fs/zfs.h>
 
 uint32_t zio_injection_enabled;
@@ -70,8 +69,9 @@ zio_match_handler(zbookmark_t *zb, uint64_t type,
        /*
         * Check for a match against the MOS, which is based on type
         */
-       if (zb->zb_objset == 0 && record->zi_objset == 0 &&
-           record->zi_object == 0) {
+       if (zb->zb_objset == DMU_META_OBJSET &&
+           record->zi_objset == DMU_META_OBJSET &&
+           record->zi_object == DMU_META_DNODE_OBJECT) {
                if (record->zi_type == DMU_OT_NONE ||
                    type == record->zi_type)
                        return (record->zi_freq == 0 ||
@@ -96,6 +96,31 @@ zio_match_handler(zbookmark_t *zb, uint64_t type,
 }
 
 /*
+ * Panic the system when a config change happens in the function
+ * specified by tag.
+ */
+void
+zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type)
+{
+       inject_handler_t *handler;
+
+       rw_enter(&inject_lock, RW_READER);
+
+       for (handler = list_head(&inject_handlers); handler != NULL;
+           handler = list_next(&inject_handlers, handler)) {
+
+               if (spa != handler->zi_spa)
+                       continue;
+
+               if (handler->zi_record.zi_type == type &&
+                   strcmp(tag, handler->zi_record.zi_func) == 0)
+                       panic("Panic requested in function %s\n", tag);
+       }
+
+       rw_exit(&inject_lock);
+}
+
+/*
  * Determine if the I/O in question should return failure.  Returns the errno
  * to be returned to the caller.
  */
@@ -126,8 +151,10 @@ zio_handle_fault_injection(zio_t *zio, int error)
                if (zio->io_spa != handler->zi_spa)
                        continue;
 
-               /* Ignore device errors */
-               if (handler->zi_record.zi_guid != 0)
+               /* Ignore device errors and panic injection */
+               if (handler->zi_record.zi_guid != 0 ||
+                   handler->zi_record.zi_func[0] != '\0' ||
+                   handler->zi_record.zi_duration != 0)
                        continue;
 
                /* If this handler matches, return EIO */
@@ -159,7 +186,7 @@ zio_handle_label_injection(zio_t *zio, int error)
        int label;
        int ret = 0;
 
-       if (offset + zio->io_size > VDEV_LABEL_START_SIZE &&
+       if (offset >= VDEV_LABEL_START_SIZE &&
            offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
                return (0);
 
@@ -170,8 +197,10 @@ zio_handle_label_injection(zio_t *zio, int error)
                uint64_t start = handler->zi_record.zi_start;
                uint64_t end = handler->zi_record.zi_end;
 
-               /* Ignore device only faults */
-               if (handler->zi_record.zi_start == 0)
+               /* Ignore device only faults or panic injection */
+               if (handler->zi_record.zi_start == 0 ||
+                   handler->zi_record.zi_func[0] != '\0' ||
+                   handler->zi_record.zi_duration != 0)
                        continue;
 
                /*
@@ -200,13 +229,30 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
        inject_handler_t *handler;
        int ret = 0;
 
+       /*
+        * We skip over faults in the labels unless it's during
+        * device open (i.e. zio == NULL).
+        */
+       if (zio != NULL) {
+               uint64_t offset = zio->io_offset;
+
+               if (offset < VDEV_LABEL_START_SIZE ||
+                   offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
+                       return (0);
+       }
+
        rw_enter(&inject_lock, RW_READER);
 
        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {
 
-               /* Ignore label specific faults */
-               if (handler->zi_record.zi_start != 0)
+               /*
+                * Ignore label specific faults, panic injection
+                * or fake writes
+                */
+               if (handler->zi_record.zi_start != 0 ||
+                   handler->zi_record.zi_func[0] != '\0' ||
+                   handler->zi_record.zi_duration != 0)
                        continue;
 
                if (vd->vdev_guid == handler->zi_record.zi_guid) {
@@ -216,6 +262,12 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
                                continue;
                        }
 
+                       /* Handle type specific I/O failures */
+                       if (zio != NULL &&
+                           handler->zi_record.zi_iotype != ZIO_TYPES &&
+                           handler->zi_record.zi_iotype != zio->io_type)
+                               continue;
+
                        if (handler->zi_record.zi_error == error) {
                                /*
                                 * For a failed open, pretend like the device
@@ -224,6 +276,16 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
                                if (error == ENXIO)
                                        vd->vdev_stat.vs_aux =
                                            VDEV_AUX_OPEN_FAILED;
+
+                               /*
+                                * Treat these errors as if they had been
+                                * retried so that all the appropriate stats
+                                * and FMA events are generated.
+                                */
+                               if (!handler->zi_record.zi_failfast &&
+                                   zio != NULL)
+                                       zio->io_flags |= ZIO_FLAG_IO_RETRY;
+
                                ret = error;
                                break;
                        }
@@ -240,6 +302,84 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
 }
 
 /*
+ * Simulate hardware that ignores cache flushes.  For requested number
+ * of seconds nix the actual writing to disk.
+ */
+void
+zio_handle_ignored_writes(zio_t *zio)
+{
+       inject_handler_t *handler;
+
+       rw_enter(&inject_lock, RW_READER);
+
+       for (handler = list_head(&inject_handlers); handler != NULL;
+           handler = list_next(&inject_handlers, handler)) {
+
+               /* Ignore errors not destined for this pool */
+               if (zio->io_spa != handler->zi_spa)
+                       continue;
+
+               if (handler->zi_record.zi_duration == 0)
+                       continue;
+
+               /*
+                * Positive duration implies # of seconds, negative
+                * a number of txgs
+                */
+               if (handler->zi_record.zi_timer == 0) {
+                       if (handler->zi_record.zi_duration > 0)
+                               handler->zi_record.zi_timer = ddi_get_lbolt64();
+                       else
+                               handler->zi_record.zi_timer = zio->io_txg;
+               }
+
+               /* Have a "problem" writing 60% of the time */
+               if (spa_get_random(100) < 60)
+                       zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
+               break;
+       }
+
+       rw_exit(&inject_lock);
+}
+
+void
+spa_handle_ignored_writes(spa_t *spa)
+{
+       inject_handler_t *handler;
+
+       if (zio_injection_enabled == 0)
+               return;
+
+       rw_enter(&inject_lock, RW_READER);
+
+       for (handler = list_head(&inject_handlers); handler != NULL;
+           handler = list_next(&inject_handlers, handler)) {
+
+               /* Ignore errors not destined for this pool */
+               if (spa != handler->zi_spa)
+                       continue;
+
+               if (handler->zi_record.zi_duration == 0)
+                       continue;
+
+               if (handler->zi_record.zi_duration > 0) {
+                       VERIFY(handler->zi_record.zi_timer == 0 ||
+                           handler->zi_record.zi_timer +
+                           handler->zi_record.zi_duration * hz >
+                           ddi_get_lbolt64());
+               } else {
+                       /* duration is negative so the subtraction here adds */
+                       VERIFY(handler->zi_record.zi_timer == 0 ||
+                           handler->zi_record.zi_timer -
+                           handler->zi_record.zi_duration >=
+                           spa_syncing_txg(spa));
+               }
+       }
+
+       rw_exit(&inject_lock);
+}
+
+/*
  * Create a new handler for the given record.  We add it to the list, adding
  * a reference to the spa_t in the process.  We increment zio_injection_enabled,
  * which is the switch to trigger all fault injection.
diff --git a/module/zfs/zle.c b/module/zfs/zle.c
new file mode 100644 (file)
index 0000000..13c5673
--- /dev/null
@@ -0,0 +1,86 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Zero-length encoding.  This is a fast and simple algorithm to eliminate
+ * runs of zeroes.  Each chunk of compressed data begins with a length byte, b.
+ * If b < n (where n is the compression parameter) then the next b + 1 bytes
+ * are literal values.  If b >= n then the next (b - n + 1) bytes are zero.
+ */
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+
+size_t
+zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+       uchar_t *src = s_start;
+       uchar_t *dst = d_start;
+       uchar_t *s_end = src + s_len;
+       uchar_t *d_end = dst + d_len;
+
+       while (src < s_end && dst < d_end - 1) {
+               uchar_t *first = src;
+               uchar_t *len = dst++;
+               if (src[0] == 0) {
+                       uchar_t *last = src + (256 - n);
+                       while (src < MIN(last, s_end) && src[0] == 0)
+                               src++;
+                       *len = src - first - 1 + n;
+               } else {
+                       uchar_t *last = src + n;
+                       if (d_end - dst < n)
+                               break;
+                       while (src < MIN(last, s_end) - 1 && (src[0] | src[1]))
+                               *dst++ = *src++;
+                       if (src[0])
+                               *dst++ = *src++;
+                       *len = src - first - 1;
+               }
+       }
+       return (src == s_end ? dst - (uchar_t *)d_start : s_len);
+}
+
+int
+zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+       uchar_t *src = s_start;
+       uchar_t *dst = d_start;
+       uchar_t *s_end = src + s_len;
+       uchar_t *d_end = dst + d_len;
+
+       while (src < s_end && dst < d_end) {
+               int len = 1 + *src++;
+               if (len <= n) {
+                       while (len-- != 0)
+                               *dst++ = *src++;
+               } else {
+                       len -= n;
+                       while (len-- != 0)
+                               *dst++ = 0;
+               }
+       }
+       return (dst == d_end ? 0 : -1);
+}
index 1b74b7e..474c49a 100755 (executable)
@@ -11,7 +11,7 @@ RELEASE=$1
 PROG=update-zfs.sh
 REMOTE_DOC_FILE=man-sunosman-20090930.tar.bz2
 REMOTE_DOC=http://dlc.sun.com/osol/man/downloads/current/${REMOTE_DOC_FILE}
-REMOTE_SRC=http://dlc.sun.com/osol/on/downloads/${RELEASE}/on-src.tar.bz2
+REMOTE_SRC=ssh://anon@hg.opensolaris.org/hg/onnv/onnv-gate
 
 die() {
        rm -Rf ${SRC}
@@ -29,10 +29,10 @@ if [ `basename $DST` != "scripts" ]; then
 fi
 
 if [ ! "$RELEASE" ]; then
-       die "Must specify ZFS release build"
+       die "Must specify ZFS release build, i.e. 'onnv_141'"
 fi
 
-SRC=`mktemp -d /tmp/os-${RELEASE}.XXXXXXXXXX`
+SRC=`mktemp -d /tmp/onnv-gate.XXXXXXXXXX`
 DST=`dirname $DST`
 
 echo "----------------------------------------------------------------"
@@ -41,28 +41,27 @@ echo "Remote Docs:   ${REMOTE_DOC}"
 echo "Local Source:  ${SRC}"
 echo "Local Dest:    ${DST}"
 echo
-echo "------------- Fetching OpenSolaris ${RELEASE} archive ----------------"
-wget -q ${REMOTE_SRC} -P ${SRC} ||
-       die "Error 'wget ${REMOTE_SRC}'"
-
+echo "------------- Fetching OpenSolaris mercurial repo ----------------"
+pushd ${SRC}
+hg clone ${REMOTE_SRC} || die "Error cloning OpenSolaris mercurial repo"
+pushd onnv-gate
+hg update -C ${RELEASE} || die "Error unknown release ${RELEASE}"
+popd
+popd
 echo "------------- Fetching OpenSolaris documentation ---------------"
 wget -q ${REMOTE_DOC} -P ${SRC} ||
        die "Error 'wget ${REMOTE_DOC}'"
 
-echo "------------- Unpacking OpenSolaris ${RELEASE} archive ---------------"
-tar -xjf ${SRC}/on-src.tar.bz2 -C ${SRC} ||
-       die "Error 'tar -xjf ${SRC}/on-src.tar.bz2 -C ${SRC}'"
-
 echo "------------- Unpacking OpenSolaris documentation --------------"
 tar -xjf ${SRC}/${REMOTE_DOC_FILE} -C ${SRC} ||
        die "Error 'tar -xjf ${SRC}/${REMOTE_DOC_FILE} -C ${SRC}'"
 
-SRC_LIB=${SRC}/usr/src/lib
-SRC_CMD=${SRC}/usr/src/cmd
-SRC_CM=${SRC}/usr/src/common
-SRC_UTS=${SRC}/usr/src/uts
-SRC_UCM=${SRC}/usr/src/uts/common
-SRC_ZLIB=${SRC}/usr/src/uts/common/fs/zfs
+SRC_LIB=${SRC}/onnv-gate/usr/src/lib
+SRC_CMD=${SRC}/onnv-gate/usr/src/cmd
+SRC_CM=${SRC}/onnv-gate/usr/src/common
+SRC_UTS=${SRC}/onnv-gate/usr/src/uts
+SRC_UCM=${SRC}/onnv-gate/usr/src/uts/common
+SRC_ZLIB=${SRC}/onnv-gate/usr/src/uts/common/fs/zfs
 SRC_MAN=${SRC}/man
 
 DST_MOD=${DST}/module
@@ -177,6 +176,6 @@ cp ${SRC_MAN}/man1m/zpool.1m                        ${DST_MAN}/man8/zpool.8
 cp ${SRC_MAN}/man1m/zdb.1m                     ${DST_MAN}/man8/zdb.8
 chmod -R 644 ${DST_MAN}/man8/*
 
-echo "${REMOTE_SRC}" >${DST}/ZFS.RELEASE
+echo "${REMOTE_SRC}/${RELEASE}" >${DST}/ZFS.RELEASE
 
 rm -Rf ${SRC}