Merge branch 'feature-flags'
authorBrian Behlendorf <behlendorf1@llnl.gov>
Tue, 8 Jan 2013 18:59:05 +0000 (10:59 -0800)
committerBrian Behlendorf <behlendorf1@llnl.gov>
Tue, 8 Jan 2013 18:59:08 +0000 (10:59 -0800)
Feature flags support for ZFS ported from Illumos.  Only minimal
compatibility changes were made where required to accommodate Linux.
For a detailed description of feature flags see the original proposal
on zfs-discuss.  They are conceptually very similar to Linux's
ext[234] style of feature flags.

http://lists.freebsd.org/pipermail/freebsd-fs/2011-May/011568.html

NOTE: This branch updates the default pool version for new pools
from 28 to 5000.  Version 28 pools may still be created for
compatibility with Solaris by using the '-o version=28' option.

$ zpool create -o version=28 ...

Existing pools must be manually upgraded using 'zpool upgrade'.

$ zpool upgrade ...

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #778

78 files changed:
cmd/Makefile.am
cmd/zdb/zdb.c
cmd/zhack/.gitignore [new file with mode: 0644]
cmd/zhack/Makefile.am [new file with mode: 0644]
cmd/zhack/zhack.c [new file with mode: 0644]
cmd/zpool/zpool_main.c
cmd/ztest/ztest.c
configure.ac
include/Makefile.am
include/libzfs.h
include/sys/Makefile.am
include/sys/arc.h
include/sys/bpobj.h
include/sys/bptree.h [new file with mode: 0644]
include/sys/dmu.h
include/sys/dmu_traverse.h
include/sys/dsl_dataset.h
include/sys/dsl_pool.h
include/sys/dsl_scan.h
include/sys/fs/zfs.h
include/sys/nvpair.h
include/sys/spa.h
include/sys/spa_impl.h
include/sys/txg.h
include/sys/vdev.h
include/sys/vdev_impl.h
include/sys/zap.h
include/sys/zfeature.h [new file with mode: 0644]
include/sys/zil.h
include/sys/zil_impl.h
include/sys/zio.h
include/zfeature_common.h [new file with mode: 0644]
lib/libnvpair/Makefile.am
lib/libnvpair/libnvpair.c
lib/libzfs/libzfs_config.c
lib/libzfs/libzfs_import.c
lib/libzfs/libzfs_pool.c
lib/libzfs/libzfs_status.c
lib/libzfs/libzfs_util.c
lib/libzpool/Makefile.am
lib/libzpool/kernel.c
man/man5/zpool-features.5 [new file with mode: 0644]
man/man8/zpool.8
module/nvpair/Makefile.in
module/nvpair/fnvpair.c [new file with mode: 0644]
module/zcommon/zpool_prop.c
module/zfs/Makefile.in
module/zfs/arc.c
module/zfs/bpobj.c
module/zfs/bptree.c [new file with mode: 0644]
module/zfs/dbuf.c
module/zfs/ddt.c
module/zfs/dmu.c
module/zfs/dmu_send.c
module/zfs/dmu_traverse.c
module/zfs/dmu_tx.c
module/zfs/dnode.c
module/zfs/dnode_sync.c
module/zfs/dsl_dataset.c
module/zfs/dsl_deadlist.c
module/zfs/dsl_deleg.c
module/zfs/dsl_dir.c
module/zfs/dsl_pool.c
module/zfs/dsl_scan.c
module/zfs/sa.c
module/zfs/spa.c
module/zfs/spa_config.c
module/zfs/spa_misc.c
module/zfs/txg.c
module/zfs/vdev.c
module/zfs/vdev_label.c
module/zfs/zap.c
module/zfs/zap_micro.c
module/zfs/zfeature.c [new file with mode: 0644]
module/zfs/zfeature_common.c [new file with mode: 0644]
module/zfs/zfs_ioctl.c
module/zfs/zil.c
module/zfs/zio.c

index 478da26..afdba34 100644 (file)
@@ -1,2 +1,2 @@
-SUBDIRS  = zfs zpool zdb zinject zstreamdump ztest zpios mount_zfs
+SUBDIRS  = zfs zpool zdb zhack zinject zstreamdump ztest zpios mount_zfs
 SUBDIRS += zpool_layout zvol_id zpool_id vdev_id
index ce6318e..de4ac51 100644 (file)
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <stdio.h>
@@ -54,6 +56,7 @@
 #include <sys/zfs_fuid.h>
 #include <sys/arc.h>
 #include <sys/ddt.h>
+#include <sys/zfeature.h>
 #undef ZFS_MAXNAMELEN
 #include <libzfs.h>
 
@@ -62,7 +65,8 @@
 #define        ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \
     zio_checksum_table[(idx)].ci_name : "UNKNOWN")
 #define        ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ? \
-    dmu_ot[(idx)].ot_name : "UNKNOWN")
+    dmu_ot[(idx)].ot_name : DMU_OT_IS_VALID(idx) ? \
+    dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN")
 #define        ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : DMU_OT_NUMTYPES)
 
 #ifndef lint
@@ -1099,7 +1103,7 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
 
        ASSERT(size == sizeof (*ds));
        crtime = ds->ds_creation_time;
-       zdb_nicenum(ds->ds_used_bytes, used);
+       zdb_nicenum(ds->ds_referenced_bytes, used);
        zdb_nicenum(ds->ds_compressed_bytes, compressed);
        zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed);
        zdb_nicenum(ds->ds_unique_bytes, unique);
@@ -1143,6 +1147,44 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
 
 /* ARGSUSED */
 static int
+dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+       char blkbuf[BP_SPRINTF_LEN];
+
+       if (bp->blk_birth != 0) {
+               sprintf_blkptr(blkbuf, bp);
+               (void) printf("\t%s\n", blkbuf);
+       }
+       return (0);
+}
+
+/*
+ * Print a one-line summary of the bptree stored in object 'obj' of 'os',
+ * labelled with 'name'.  Nothing is printed unless dump_opt['d'] is at
+ * least 3; at 5 or more the tree is also walked with dump_bptree_cb.
+ */
+static void
+dump_bptree(objset_t *os, uint64_t obj, char *name)
+{
+       char nicebytes[32];
+       bptree_phys_t *phys;
+       dmu_buf_t *bonus;
+       unsigned long long ndatasets;
+
+       if (dump_opt['d'] < 3)
+               return;
+
+       /* The bptree header lives in the object's bonus buffer. */
+       VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &bonus));
+       phys = bonus->db_data;
+       ndatasets = (unsigned long long)(phys->bt_end - phys->bt_begin);
+       zdb_nicenum(phys->bt_bytes, nicebytes);
+       (void) printf("\n    %s: %llu datasets, %s\n",
+           name, ndatasets, nicebytes);
+       dmu_buf_rele(bonus, FTAG);
+
+       if (dump_opt['d'] >= 5) {
+               (void) printf("\n");
+               (void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb,
+                   NULL, NULL);
+       }
+}
+
+/* ARGSUSED */
+static int
 dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
        char blkbuf[BP_SPRINTF_LEN];
@@ -1888,11 +1930,13 @@ typedef struct zdb_blkstats {
  */
 #define        ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0)
 #define        ZDB_OT_DITTO    (DMU_OT_NUMTYPES + 1)
-#define        ZDB_OT_TOTAL    (DMU_OT_NUMTYPES + 2)
+#define        ZDB_OT_OTHER    (DMU_OT_NUMTYPES + 2)
+#define        ZDB_OT_TOTAL    (DMU_OT_NUMTYPES + 3)
 
 static char *zdb_ot_extname[] = {
        "deferred free",
        "dedup ditto",
+       "other",
        "Total",
 };
 
@@ -1974,9 +2018,10 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
 
        type = BP_GET_TYPE(bp);
 
-       zdb_count_block(zcb, zilog, bp, type);
+       zdb_count_block(zcb, zilog, bp,
+           (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);
 
-       is_metadata = (BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata);
+       is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
 
        if (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata)) {
                int ioerr;
@@ -2207,6 +2252,12 @@ dump_block_stats(spa_t *spa)
            count_block_cb, &zcb, NULL);
        (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
            count_block_cb, &zcb, NULL);
+       if (spa_feature_is_active(spa,
+           &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+               VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
+                   spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
+                   &zcb, NULL));
+       }
 
        if (dump_opt['c'] > 1)
                flags |= TRAVERSE_PREFETCH_DATA;
@@ -2383,7 +2434,7 @@ zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
        }
 
        if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
-           BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata)
+           BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
                return (0);
 
        ddt_key_fill(&zdde_search.zdde_key, bp);
@@ -2491,7 +2542,14 @@ dump_zpool(spa_t *spa)
                        dump_bpobj(&spa->spa_deferred_bpobj, "Deferred frees");
                        if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
                                dump_bpobj(&spa->spa_dsl_pool->dp_free_bpobj,
-                                   "Pool frees");
+                                   "Pool snapshot frees");
+                       }
+
+                       if (spa_feature_is_active(spa,
+                           &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+                               dump_bptree(spa->spa_meta_objset,
+                                   spa->spa_dsl_pool->dp_bptree_obj,
+                                   "Pool dataset frees");
                        }
                        dump_dtl(spa->spa_root_vdev, 0);
                }
diff --git a/cmd/zhack/.gitignore b/cmd/zhack/.gitignore
new file mode 100644 (file)
index 0000000..763a188
--- /dev/null
@@ -0,0 +1 @@
+/zhack
diff --git a/cmd/zhack/Makefile.am b/cmd/zhack/Makefile.am
new file mode 100644 (file)
index 0000000..47da245
--- /dev/null
@@ -0,0 +1,18 @@
+include $(top_srcdir)/config/Rules.am
+
+DEFAULT_INCLUDES += \
+       -I$(top_srcdir)/include \
+       -I$(top_srcdir)/lib/libspl/include
+
+sbin_PROGRAMS = zhack
+
+zhack_SOURCES = \
+       $(top_srcdir)/cmd/zhack/zhack.c
+
+zhack_LDADD = \
+       $(top_builddir)/lib/libnvpair/libnvpair.la \
+       $(top_builddir)/lib/libuutil/libuutil.la \
+       $(top_builddir)/lib/libzpool/libzpool.la \
+       $(top_builddir)/lib/libzfs/libzfs.la
+
+zhack_LDFLAGS = -pthread -lm $(ZLIB) -lrt -ldl $(LIBUUID) $(LIBBLKID)
diff --git a/cmd/zhack/zhack.c b/cmd/zhack/zhack.c
new file mode 100644 (file)
index 0000000..b2cf815
--- /dev/null
@@ -0,0 +1,533 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+/*
+ * zhack is a debugging tool that can write changes to ZFS pool using libzpool
+ * for testing purposes. Altering pools with zhack is unsupported and may
+ * result in corrupted pools.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/zfs_znode.h>
+#include <sys/dsl_synctask.h>
+#include <sys/vdev.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/zfeature.h>
+#undef ZFS_MAXNAMELEN
+#include <libzfs.h>
+
+extern boolean_t zfeature_checks_disable;
+
+const char cmdname[] = "zhack";
+libzfs_handle_t *g_zfs;
+static importargs_t g_importargs;
+static char *g_pool;
+static boolean_t g_readonly;
+
+/*
+ * Write the command synopsis and the list of subcommands to stderr, then
+ * exit with status 1.
+ */
+static void
+usage(void)
+{
+       static const char subcommands[] =
+           "    feature stat <pool>\n"
+           "        print information about enabled features\n"
+           "    feature enable [-d desc] <pool> <feature>\n"
+           "        add a new enabled feature to the pool\n"
+           "        -d <desc> sets the feature's description\n"
+           "    feature ref [-md] <pool> <feature>\n"
+           "        change the refcount on the given feature\n"
+           "        -d decrease instead of increase the refcount\n"
+           "        -m add the feature to the label if increasing refcount\n"
+           "\n"
+           "    <feature> : should be a feature guid\n";
+
+       (void) fprintf(stderr,
+           "Usage: %s [-c cachefile] [-d dir] <subcommand> <args> ...\n"
+           "where <subcommand> <args> is one of the following:\n"
+           "\n", cmdname);
+       (void) fputs(subcommands, stderr);
+
+       exit(1);
+}
+
+
+/*
+ * Report an error on stderr as "<cmdname>: <message>" followed by a
+ * newline, then exit with status 1.  'fmt' and the variadic arguments are
+ * interpreted as by printf().
+ */
+static void
+fatal(const char *fmt, ...)
+{
+       va_list args;
+
+       (void) fprintf(stderr, "%s: ", cmdname);
+
+       va_start(args, fmt);
+       (void) vfprintf(stderr, fmt, args);
+       va_end(args);
+
+       (void) fputc('\n', stderr);
+
+       exit(1);
+}
+
+/*
+ * Space-accounting callback registered for DMU_OST_ZFS objsets.  Returns
+ * ENOENT for any bonus type other than DMU_OT_ZNODE or DMU_OT_SA; for
+ * those two types it prints a message and aborts, because zhack does not
+ * expect to modify objects that need user accounting.
+ */
+/* ARGSUSED */
+static int
+space_delta_cb(dmu_object_type_t bonustype, void *data,
+    uint64_t *userp, uint64_t *groupp)
+{
+       boolean_t tracked;
+
+       /* Only znode and SA bonus buffers take part in user accounting. */
+       tracked = (bonustype == DMU_OT_ZNODE || bonustype == DMU_OT_SA);
+       if (!tracked)
+               return (ENOENT);
+
+       (void) fprintf(stderr, "modifying object that needs user accounting");
+       abort();
+       /* NOTREACHED */
+}
+
+/*
+ * Import the pool that holds 'target' into this process via libzpool.
+ *
+ * target:   a pool or dataset name; anything from the first '/' or '@'
+ *           onward is stripped to obtain the pool name.
+ * readonly: when set, the kernel context is initialized with FREAD only,
+ *           the pool is imported with the readonly property, and the
+ *           import search is allowed to match a potentially active pool.
+ *
+ * Sets g_zfs, g_readonly and g_importargs, and (except on the readonly
+ * fast path) g_pool.  Any failure is reported through fatal(), which
+ * exits the process.
+ */
+static void
+import_pool(const char *target, boolean_t readonly)
+{
+       nvlist_t *config;
+       nvlist_t *pools;
+       int error;
+       char *sepp;
+       spa_t *spa;
+       nvpair_t *elem;
+       nvlist_t *props;
+       const char *name;
+
+       kernel_init(readonly ? FREAD : (FREAD | FWRITE));
+       g_zfs = libzfs_init();
+       ASSERT(g_zfs != NULL);
+
+       dmu_objset_register_type(DMU_OST_ZFS, space_delta_cb);
+
+       g_readonly = readonly;
+
+       /*
+        * If we only want readonly access, it's OK if we find
+        * a potentially-active (ie, imported into the kernel) pool from the
+        * default cachefile.
+        */
+       if (readonly && spa_open(target, &spa, FTAG) == 0) {
+               spa_close(spa, FTAG);
+               return;
+       }
+
+       g_importargs.unique = B_TRUE;
+       g_importargs.can_be_active = readonly;
+       g_pool = strdup(target);
+       /* strpbrk() on a NULL pointer is undefined; fail cleanly instead. */
+       if (g_pool == NULL)
+               fatal("cannot import '%s': out of memory", target);
+       if ((sepp = strpbrk(g_pool, "/@")) != NULL)
+               *sepp = '\0';
+       g_importargs.poolname = g_pool;
+       pools = zpool_search_import(g_zfs, &g_importargs);
+
+       if (pools == NULL || nvlist_next_nvpair(pools, NULL) == NULL) {
+               /*
+                * Nothing importable was found.  Retry the search allowing
+                * active pools so the user gets a more specific message.
+                */
+               if (!g_importargs.can_be_active) {
+                       g_importargs.can_be_active = B_TRUE;
+                       if (zpool_search_import(g_zfs, &g_importargs) != NULL ||
+                           spa_open(target, &spa, FTAG) == 0) {
+                               fatal("cannot import '%s': pool is active; run "
+                                   "\"zpool export %s\" first\n",
+                                   g_pool, g_pool);
+                       }
+               }
+
+               fatal("cannot import '%s': no such pool available\n", g_pool);
+       }
+
+       /* g_importargs.unique was set, so only the first match is used. */
+       elem = nvlist_next_nvpair(pools, NULL);
+       name = nvpair_name(elem);
+       VERIFY(nvpair_value_nvlist(elem, &config) == 0);
+
+       props = NULL;
+       if (readonly) {
+               VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
+               VERIFY(nvlist_add_uint64(props,
+                   zpool_prop_to_name(ZPOOL_PROP_READONLY), 1) == 0);
+       }
+
+       /* Feature checks are switched off only for the import itself. */
+       zfeature_checks_disable = B_TRUE;
+       error = spa_import(name, config, props, ZFS_IMPORT_NORMAL);
+       zfeature_checks_disable = B_FALSE;
+       /* A pool that is already imported (EEXIST) is not an error here. */
+       if (error == EEXIST)
+               error = 0;
+
+       if (error)
+               fatal("can't import '%s': %s", name, strerror(error));
+}
+
+/*
+ * Import the pool behind 'target' and open it, storing the handle in
+ * '*spa' under the hold tag 'tag'.  Exits through fatal() if the open
+ * fails or if the pool's version is below SPA_VERSION_FEATURES.
+ */
+static void
+zhack_spa_open(const char *target, boolean_t readonly, void *tag, spa_t **spa)
+{
+       int error;
+
+       import_pool(target, readonly);
+
+       /* Feature checks are switched off only around the open call. */
+       zfeature_checks_disable = B_TRUE;
+       error = spa_open(target, spa, tag);
+       zfeature_checks_disable = B_FALSE;
+
+       if (error != 0)
+               fatal("cannot open '%s': %s", target, strerror(error));
+
+       if (spa_version(*spa) >= SPA_VERSION_FEATURES)
+               return;
+
+       fatal("'%s' has version %d, features not enabled", target,
+           (int)spa_version(*spa));
+}
+
+/*
+ * Print every entry of the ZAP object 'obj' in 'os' under the heading
+ * "<name>_obj:".  Entries with 8-byte integers are printed as a number;
+ * all other entries are looked up again and printed as a string.
+ */
+static void
+dump_obj(objset_t *os, uint64_t obj, const char *name)
+{
+       zap_cursor_t cursor;
+       zap_attribute_t attr;
+
+       (void) printf("%s_obj:\n", name);
+
+       zap_cursor_init(&cursor, os, obj);
+       while (zap_cursor_retrieve(&cursor, &attr) == 0) {
+               if (attr.za_integer_length != 8) {
+                       ASSERT(attr.za_integer_length == 1);
+                       char val[1024];
+                       VERIFY(zap_lookup(os, obj, attr.za_name,
+                           1, sizeof (val), val) == 0);
+                       (void) printf("\t%s = %s\n", attr.za_name, val);
+               } else {
+                       ASSERT(attr.za_num_integers == 1);
+                       (void) printf("\t%s = %llu\n",
+                           attr.za_name, (u_longlong_t)attr.za_first_integer);
+               }
+               zap_cursor_advance(&cursor);
+       }
+       zap_cursor_fini(&cursor);
+}
+
+/*
+ * Print the name of every nvpair in spa->spa_label_features under the
+ * heading "label config:".
+ */
+static void
+dump_mos(spa_t *spa)
+{
+       nvlist_t *features = spa->spa_label_features;
+       nvpair_t *cur = NULL;
+
+       (void) printf("label config:\n");
+
+       /* Passing the previous pair (initially NULL) yields the next one. */
+       while ((cur = nvlist_next_nvpair(features, cur)) != NULL)
+               (void) printf("\t%s\n", nvpair_name(cur));
+}
+
+/*
+ * 'zhack feature stat <pool>': open the pool readonly and dump its
+ * for_read, for_write and descriptions feature objects followed by the
+ * label feature list.  On entry argv[0] is the "stat" keyword.
+ */
+static void
+zhack_do_feature_stat(int argc, char **argv)
+{
+       spa_t *spa;
+       objset_t *mos;
+
+       /* argv[0] is "stat"; the pool name must follow it. */
+       if (argc < 2) {
+               (void) fprintf(stderr, "error: missing pool name\n");
+               usage();
+       }
+
+       zhack_spa_open(argv[1], B_TRUE, FTAG, &spa);
+       mos = spa->spa_meta_objset;
+
+       dump_obj(mos, spa->spa_feat_for_read_obj, "for_read");
+       dump_obj(mos, spa->spa_feat_for_write_obj, "for_write");
+       dump_obj(mos, spa->spa_feat_desc_obj, "descriptions");
+       dump_mos(spa);
+
+       spa_close(spa, FTAG);
+}
+
+/*
+ * Sync-task callback: enable the feature described by 'arg2'
+ * (a zfeature_info_t *) on the pool 'arg1' (a spa_t *) in transaction 'tx'.
+ */
+static void
+feature_enable_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+       spa_feature_enable((spa_t *)arg1, (zfeature_info_t *)arg2, tx);
+}
+
+/*
+ * 'zhack feature enable [-r] [-d desc] <pool> <feature>': add a new,
+ * zhack-injected feature to the pool.
+ *
+ *   -r         mark the feature as one that allows readonly access
+ *              (sets fi_can_readonly)
+ *   -d <desc>  description stored with the feature; defaults to
+ *              "zhack injected"
+ *
+ * The guid must be syntactically valid, must not name a feature known to
+ * zfeature_lookup_guid(), and must not already be present in the pool's
+ * description object.  All failures exit through fatal() or usage().
+ */
+static void
+zhack_do_feature_enable(int argc, char **argv)
+{
+       /*
+        * getopt() returns an int and signals the end of options with -1.
+        * Storing that in a plain char never compares equal to -1 on
+        * platforms where char is unsigned, so the loop would not terminate.
+        */
+       int c;
+       char *desc, *target;
+       spa_t *spa;
+       objset_t *mos;
+       zfeature_info_t feature;
+       zfeature_info_t *nodeps[] = { NULL };
+
+       /*
+        * Features are not added to the pool's label until their refcounts
+        * are incremented, so fi_mos can just be left as false for now.
+        */
+       desc = NULL;
+       feature.fi_uname = "zhack";
+       feature.fi_mos = B_FALSE;
+       feature.fi_can_readonly = B_FALSE;
+       feature.fi_depends = nodeps;
+
+       /*
+        * Restart option parsing for this subcommand.  'm' is accepted by
+        * the option string but has no case below, so it ends in usage().
+        */
+       optind = 1;
+       while ((c = getopt(argc, argv, "rmd:")) != -1) {
+               switch (c) {
+               case 'r':
+                       feature.fi_can_readonly = B_TRUE;
+                       break;
+               case 'd':
+                       desc = strdup(optarg);
+                       break;
+               default:
+                       usage();
+                       break;
+               }
+       }
+
+       if (desc == NULL)
+               desc = strdup("zhack injected");
+       feature.fi_desc = desc;
+
+       argc -= optind;
+       argv += optind;
+
+       if (argc < 2) {
+               (void) fprintf(stderr, "error: missing feature or pool name\n");
+               usage();
+       }
+       target = argv[0];
+       feature.fi_guid = argv[1];
+
+       if (!zfeature_is_valid_guid(feature.fi_guid))
+               fatal("invalid feature guid: %s", feature.fi_guid);
+
+       zhack_spa_open(target, B_FALSE, FTAG, &spa);
+       mos = spa->spa_meta_objset;
+
+       /* The format string has a %s, so the guid must be passed along. */
+       if (0 == zfeature_lookup_guid(feature.fi_guid, NULL))
+               fatal("'%s' is a real feature, will not enable",
+                   feature.fi_guid);
+       if (0 == zap_contains(mos, spa->spa_feat_desc_obj, feature.fi_guid))
+               fatal("feature already enabled: %s", feature.fi_guid);
+
+       VERIFY3U(0, ==, dsl_sync_task_do(spa->spa_dsl_pool, NULL,
+           feature_enable_sync, spa, &feature, 5));
+
+       spa_close(spa, FTAG);
+
+       free(desc);
+}
+
+/*
+ * Sync-task callback: increment the refcount of the feature 'arg2'
+ * (a zfeature_info_t *) on the pool 'arg1' (a spa_t *) in transaction 'tx'.
+ */
+static void
+feature_incr_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+       spa_feature_incr((spa_t *)arg1, (zfeature_info_t *)arg2, tx);
+}
+
+/*
+ * Sync-task callback: decrement the refcount of the feature 'arg2'
+ * (a zfeature_info_t *) on the pool 'arg1' (a spa_t *) in transaction 'tx'.
+ */
+static void
+feature_decr_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+       spa_feature_decr((spa_t *)arg1, (zfeature_info_t *)arg2, tx);
+}
+
+/*
+ * 'zhack feature ref [-md] <pool> <feature>': change the refcount of a
+ * zhack-injected feature.
+ *
+ *   -m  set fi_mos on the feature before changing the refcount
+ *   -d  decrement the refcount instead of incrementing it
+ *
+ * The guid must be syntactically valid, must not name a feature known to
+ * zfeature_lookup_guid(), and must already be present in the pool's
+ * for_read or for_write object.  All failures exit through fatal() or
+ * usage().
+ */
+static void
+zhack_do_feature_ref(int argc, char **argv)
+{
+       /*
+        * getopt() returns an int and signals the end of options with -1.
+        * Storing that in a plain char never compares equal to -1 on
+        * platforms where char is unsigned, so the loop would not terminate.
+        */
+       int c;
+       char *target;
+       boolean_t decr = B_FALSE;
+       spa_t *spa;
+       objset_t *mos;
+       zfeature_info_t feature;
+       zfeature_info_t *nodeps[] = { NULL };
+
+       /*
+        * fi_desc does not matter here because it was written to disk
+        * when the feature was enabled, but we need to properly set the
+        * feature for read or write based on the information we read off
+        * disk later.
+        */
+       feature.fi_uname = "zhack";
+       feature.fi_mos = B_FALSE;
+       feature.fi_desc = NULL;
+       feature.fi_depends = nodeps;
+
+       /* Restart option parsing for this subcommand. */
+       optind = 1;
+       while ((c = getopt(argc, argv, "md")) != -1) {
+               switch (c) {
+               case 'm':
+                       feature.fi_mos = B_TRUE;
+                       break;
+               case 'd':
+                       decr = B_TRUE;
+                       break;
+               default:
+                       usage();
+                       break;
+               }
+       }
+       argc -= optind;
+       argv += optind;
+
+       if (argc < 2) {
+               (void) fprintf(stderr, "error: missing feature or pool name\n");
+               usage();
+       }
+       target = argv[0];
+       feature.fi_guid = argv[1];
+
+       if (!zfeature_is_valid_guid(feature.fi_guid))
+               fatal("invalid feature guid: %s", feature.fi_guid);
+
+       zhack_spa_open(target, B_FALSE, FTAG, &spa);
+       mos = spa->spa_meta_objset;
+
+       /* The format string has a %s, so the guid must be passed along. */
+       if (0 == zfeature_lookup_guid(feature.fi_guid, NULL))
+               fatal("'%s' is a real feature, will not change refcount",
+                   feature.fi_guid);
+
+       /*
+        * Derive fi_can_readonly from which on-disk object lists the
+        * feature: for_read means not readonly-compatible, for_write means
+        * readonly-compatible.
+        */
+       if (0 == zap_contains(mos, spa->spa_feat_for_read_obj,
+           feature.fi_guid)) {
+               feature.fi_can_readonly = B_FALSE;
+       } else if (0 == zap_contains(mos, spa->spa_feat_for_write_obj,
+           feature.fi_guid)) {
+               feature.fi_can_readonly = B_TRUE;
+       } else {
+               fatal("feature is not enabled: %s", feature.fi_guid);
+       }
+
+       if (decr && !spa_feature_is_active(spa, &feature))
+               fatal("feature refcount already 0: %s", feature.fi_guid);
+
+       VERIFY3U(0, ==, dsl_sync_task_do(spa->spa_dsl_pool, NULL,
+           decr ? feature_decr_sync : feature_incr_sync, spa, &feature, 5));
+
+       spa_close(spa, FTAG);
+}
+
+/*
+ * Dispatch 'zhack feature <op> ...' to the stat, enable or ref handler.
+ * On entry argv[0] is the "feature" keyword; the handler receives the
+ * argument vector starting at <op>.  Returns 0; a missing or unknown
+ * operation ends in usage().
+ */
+static int
+zhack_do_feature(int argc, char **argv)
+{
+       const char *op;
+
+       /* Drop the "feature" keyword. */
+       argv++;
+       argc--;
+       if (argc == 0) {
+               (void) fprintf(stderr,
+                   "error: no feature operation specified\n");
+               usage();
+       }
+
+       op = argv[0];
+       if (strcmp(op, "stat") == 0)
+               zhack_do_feature_stat(argc, argv);
+       else if (strcmp(op, "enable") == 0)
+               zhack_do_feature_enable(argc, argv);
+       else if (strcmp(op, "ref") == 0)
+               zhack_do_feature_ref(argc, argv);
+       else {
+               (void) fprintf(stderr, "error: unknown subcommand: %s\n",
+                   op);
+               usage();
+       }
+
+       return (0);
+}
+
+#define        MAX_NUM_PATHS 1024
+
+/*
+ * Entry point.  Parses the global options (-c cachefile, -d dir, the
+ * latter repeatable up to MAX_NUM_PATHS times), dispatches the subcommand,
+ * then exports the pool unless it was opened readonly and tears down
+ * libzfs and the libzpool kernel context.  Returns the subcommand's
+ * result.
+ */
+int
+main(int argc, char **argv)
+{
+       extern void zfs_prop_init(void);
+
+       char *path[MAX_NUM_PATHS];
+       const char *subcommand;
+       int rv = 0;
+       /*
+        * getopt() returns an int and signals the end of options with -1.
+        * Storing that in a plain char never compares equal to -1 on
+        * platforms where char is unsigned, so the loop would not terminate.
+        */
+       int c;
+
+       g_importargs.path = path;
+
+       dprintf_setup(&argc, argv);
+       zfs_prop_init();
+
+       while ((c = getopt(argc, argv, "c:d:")) != -1) {
+               switch (c) {
+               case 'c':
+                       g_importargs.cachefile = optarg;
+                       break;
+               case 'd':
+                       assert(g_importargs.paths < MAX_NUM_PATHS);
+                       g_importargs.path[g_importargs.paths++] = optarg;
+                       break;
+               default:
+                       usage();
+                       break;
+               }
+       }
+
+       /* Leave argv[0] pointing at the subcommand for the handlers. */
+       argc -= optind;
+       argv += optind;
+       optind = 1;
+
+       if (argc == 0) {
+               (void) fprintf(stderr, "error: no command specified\n");
+               usage();
+       }
+
+       subcommand = argv[0];
+
+       if (strcmp(subcommand, "feature") == 0) {
+               rv = zhack_do_feature(argc, argv);
+       } else {
+               (void) fprintf(stderr, "error: unknown subcommand: %s\n",
+                   subcommand);
+               usage();
+       }
+
+       /* A pool opened for writing is exported before exiting. */
+       if (!g_readonly && spa_export(g_pool, NULL, B_TRUE, B_TRUE) != 0) {
+               fatal("pool export failed; "
+                   "changes may not be committed to disk\n");
+       }
+
+       libzfs_fini(g_zfs);
+       kernel_fini();
+
+       return (rv);
+}
index cb0535a..8da4620 100644 (file)
@@ -54,6 +54,7 @@
 
 #include "zpool_util.h"
 #include "zfs_comutil.h"
+#include "zfeature_common.h"
 
 #include "statcommon.h"
 
@@ -208,7 +209,7 @@ get_usage(zpool_help_t idx) {
        case HELP_CLEAR:
                return (gettext("\tclear [-nF] <pool> [device]\n"));
        case HELP_CREATE:
-               return (gettext("\tcreate [-fn] [-o property=value] ... \n"
+               return (gettext("\tcreate [-fnd] [-o property=value] ... \n"
                    "\t    [-O file-system-property=value] ... \n"
                    "\t    [-m mountpoint] [-R root] <pool> <vdev> ...\n"));
        case HELP_DESTROY:
@@ -341,6 +342,12 @@ usage(boolean_t requested)
                /* Iterate over all properties */
                (void) zprop_iter(print_prop_cb, fp, B_FALSE, B_TRUE,
                    ZFS_TYPE_POOL);
+
+               (void) fprintf(fp, "\t%-15s   ", "feature@...");
+               (void) fprintf(fp, "YES   disabled | enabled | active\n");
+
+               (void) fprintf(fp, gettext("\nThe feature@ properties must be "
+                   "appended with a feature name.\nSee zpool-features(5).\n"));
        }
 
        /*
@@ -384,6 +391,18 @@ print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent,
        }
 }
 
+/*
+ * Return B_TRUE if any pair in 'proplist' has a name that
+ * zpool_prop_feature() accepts, B_FALSE otherwise.
+ */
+static boolean_t
+prop_list_contains_feature(nvlist_t *proplist)
+{
+       nvpair_t *cur = NULL;
+
+       /* Passing the previous pair (initially NULL) yields the next one. */
+       while ((cur = nvlist_next_nvpair(proplist, cur)) != NULL) {
+               if (zpool_prop_feature(nvpair_name(cur)))
+                       return (B_TRUE);
+       }
+
+       return (B_FALSE);
+}
+
 /*
  * Add a property pair (name, string-value) into a property nvlist.
  */
@@ -407,12 +426,34 @@ add_prop_list(const char *propname, char *propval, nvlist_t **props,
        proplist = *props;
 
        if (poolprop) {
-               if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) {
+               const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION);
+
+               if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL &&
+                   !zpool_prop_feature(propname)) {
                        (void) fprintf(stderr, gettext("property '%s' is "
                            "not a valid pool property\n"), propname);
                        return (2);
                }
-               normnm = zpool_prop_to_name(prop);
+
+               /*
+                * feature@ properties and version should not be specified
+                * at the same time.
+                */
+               if ((prop == ZPROP_INVAL && zpool_prop_feature(propname) &&
+                   nvlist_exists(proplist, vname)) ||
+                   (prop == ZPOOL_PROP_VERSION &&
+                   prop_list_contains_feature(proplist))) {
+                       (void) fprintf(stderr, gettext("'feature@' and "
+                           "'version' properties cannot be specified "
+                           "together\n"));
+                       return (2);
+               }
+
+
+               if (zpool_prop_feature(propname))
+                       normnm = propname;
+               else
+                       normnm = zpool_prop_to_name(prop);
        } else {
                if ((fprop = zfs_name_to_prop(propname)) != ZPROP_INVAL) {
                        normnm = zfs_prop_to_name(fprop);
@@ -601,7 +642,7 @@ zpool_do_remove(int argc, char **argv)
 }
 
 /*
- * zpool create [-fn] [-o property=value] ...
+ * zpool create [-fnd] [-o property=value] ...
  *             [-O file-system-property=value] ...
  *             [-R root] [-m mountpoint] <pool> <dev> ...
  *
@@ -610,8 +651,10 @@ zpool_do_remove(int argc, char **argv)
  *             were to be created.
  *      -R     Create a pool under an alternate root
  *      -m     Set default mountpoint for the root dataset.  By default it's
- *             '/<pool>'
+ *             '/<pool>'
  *     -o      Set property=value.
+ *     -d      Don't automatically enable all supported pool features
+ *             (individual features can be enabled with -o).
  *     -O      Set fsproperty=value in the pool's root file system
  *
  * Creates the named pool according to the given vdev specification.  The
@@ -624,6 +667,7 @@ zpool_do_create(int argc, char **argv)
 {
        boolean_t force = B_FALSE;
        boolean_t dryrun = B_FALSE;
+       boolean_t enable_all_pool_feat = B_TRUE;
        int c;
        nvlist_t *nvroot = NULL;
        char *poolname;
@@ -635,7 +679,7 @@ zpool_do_create(int argc, char **argv)
        char *propval;
 
        /* check options */
-       while ((c = getopt(argc, argv, ":fnR:m:o:O:")) != -1) {
+       while ((c = getopt(argc, argv, ":fndR:m:o:O:")) != -1) {
                switch (c) {
                case 'f':
                        force = B_TRUE;
@@ -643,6 +687,9 @@ zpool_do_create(int argc, char **argv)
                case 'n':
                        dryrun = B_TRUE;
                        break;
+               case 'd':
+                       enable_all_pool_feat = B_FALSE;
+                       break;
                case 'R':
                        altroot = optarg;
                        if (add_prop_list(zpool_prop_to_name(
@@ -670,6 +717,21 @@ zpool_do_create(int argc, char **argv)
 
                        if (add_prop_list(optarg, propval, &props, B_TRUE))
                                goto errout;
+
+                       /*
+                        * If the user is creating a pool that doesn't support
+                        * feature flags, don't enable any features.
+                        */
+                       if (zpool_name_to_prop(optarg) == ZPOOL_PROP_VERSION) {
+                               char *end;
+                               u_longlong_t ver;
+
+                               ver = strtoull(propval, &end, 10);
+                               if (*end == '\0' &&
+                                   ver < SPA_VERSION_FEATURES) {
+                                       enable_all_pool_feat = B_FALSE;
+                               }
+                       }
                        break;
                case 'O':
                        if ((propval = strchr(optarg, '=')) == NULL) {
@@ -735,7 +797,6 @@ zpool_do_create(int argc, char **argv)
                goto errout;
        }
 
-
        if (altroot != NULL && altroot[0] != '/') {
                (void) fprintf(stderr, gettext("invalid alternate root '%s': "
                    "must be an absolute path\n"), altroot);
@@ -817,6 +878,27 @@ zpool_do_create(int argc, char **argv)
                /*
                 * Hand off to libzfs.
                 */
+               if (enable_all_pool_feat) {
+                       int i;
+                       for (i = 0; i < SPA_FEATURES; i++) {
+                               char propname[MAXPATHLEN];
+                               zfeature_info_t *feat = &spa_feature_table[i];
+
+                               (void) snprintf(propname, sizeof (propname),
+                                   "feature@%s", feat->fi_uname);
+
+                               /*
+                                * Skip feature if user specified it manually
+                                * on the command line.
+                                */
+                               if (nvlist_exists(props, propname))
+                                       continue;
+
+                               if (add_prop_list(propname, ZFS_FEATURE_ENABLED,
+                                   &props, B_TRUE) != 0)
+                                       goto errout;
+                       }
+               }
                if (zpool_create(g_zfs, poolname,
                    nvroot, props, fsprops) == 0) {
                        zfs_handle_t *pool = zfs_open(g_zfs, poolname,
@@ -1148,6 +1230,10 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
                        (void) printf(gettext("newer version"));
                        break;
 
+               case VDEV_AUX_UNSUP_FEAT:
+                       (void) printf(gettext("unsupported feature(s)"));
+                       break;
+
                case VDEV_AUX_SPARED:
                        verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
                            &cb.cb_guid) == 0);
@@ -1265,6 +1351,10 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
                        (void) printf(gettext("newer version"));
                        break;
 
+               case VDEV_AUX_UNSUP_FEAT:
+                       (void) printf(gettext("unsupported feature(s)"));
+                       break;
+
                case VDEV_AUX_ERR_EXCEEDED:
                        (void) printf(gettext("too many errors"));
                        break;
@@ -1422,8 +1512,8 @@ show_import(nvlist_t *config)
                break;
 
        case ZPOOL_STATUS_VERSION_OLDER:
-               (void) printf(gettext(" status: The pool is formatted using an "
-                   "older on-disk version.\n"));
+               (void) printf(gettext(" status: The pool is formatted using a "
+                   "legacy on-disk version.\n"));
                break;
 
        case ZPOOL_STATUS_VERSION_NEWER:
@@ -1431,6 +1521,25 @@ show_import(nvlist_t *config)
                    "incompatible version.\n"));
                break;
 
+       case ZPOOL_STATUS_FEAT_DISABLED:
+               (void) printf(gettext(" status: Some supported features are "
+                   "not enabled on the pool.\n"));
+               break;
+
+       case ZPOOL_STATUS_UNSUP_FEAT_READ:
+               (void) printf(gettext(" status: The pool uses the following "
+                   "feature(s) not supported on this system:\n"));
+               zpool_print_unsup_feat(config);
+               break;
+
+       case ZPOOL_STATUS_UNSUP_FEAT_WRITE:
+               (void) printf(gettext(" status: The pool can only be accessed "
+                   "in read-only mode on this system. It\n\tcannot be "
+                   "accessed in read-write mode because it uses the "
+                   "following\n\tfeature(s) not supported on this system:\n"));
+               zpool_print_unsup_feat(config);
+               break;
+
        case ZPOOL_STATUS_HOSTID_MISMATCH:
                (void) printf(gettext(" status: The pool was last accessed by "
                    "another system.\n"));
@@ -1463,19 +1572,21 @@ show_import(nvlist_t *config)
         * Print out an action according to the overall state of the pool.
         */
        if (vs->vs_state == VDEV_STATE_HEALTHY) {
-               if (reason == ZPOOL_STATUS_VERSION_OLDER)
+               if (reason == ZPOOL_STATUS_VERSION_OLDER ||
+                   reason == ZPOOL_STATUS_FEAT_DISABLED) {
                        (void) printf(gettext(" action: The pool can be "
                            "imported using its name or numeric identifier, "
                            "though\n\tsome features will not be available "
                            "without an explicit 'zpool upgrade'.\n"));
-               else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH)
+               } else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) {
                        (void) printf(gettext(" action: The pool can be "
                            "imported using its name or numeric "
                            "identifier and\n\tthe '-f' flag.\n"));
-               else
+               } else {
                        (void) printf(gettext(" action: The pool can be "
                            "imported using its name or numeric "
                            "identifier.\n"));
+               }
        } else if (vs->vs_state == VDEV_STATE_DEGRADED) {
                (void) printf(gettext(" action: The pool can be imported "
                    "despite missing or damaged devices.  The\n\tfault "
@@ -1488,6 +1599,20 @@ show_import(nvlist_t *config)
                            "newer\n\tsoftware, or recreate the pool from "
                            "backup.\n"));
                        break;
+               case ZPOOL_STATUS_UNSUP_FEAT_READ:
+                       (void) printf(gettext(" action: The pool cannot be "
+                           "imported. Access the pool on a system that "
+                           "supports\n\tthe required feature(s), or recreate "
+                           "the pool from backup.\n"));
+                       break;
+               case ZPOOL_STATUS_UNSUP_FEAT_WRITE:
+                       (void) printf(gettext(" action: The pool cannot be "
+                           "imported in read-write mode. Import the pool "
+                           "with\n"
+                           "\t\"-o readonly=on\", access the pool on a system "
+                           "that supports the\n\trequired feature(s), or "
+                           "recreate the pool from backup.\n"));
+                       break;
                case ZPOOL_STATUS_MISSING_DEV_R:
                case ZPOOL_STATUS_MISSING_DEV_NR:
                case ZPOOL_STATUS_BAD_GUID_SUM:
@@ -1563,9 +1688,9 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
            ZPOOL_CONFIG_POOL_STATE, &state) == 0);
        verify(nvlist_lookup_uint64(config,
            ZPOOL_CONFIG_VERSION, &version) == 0);
-       if (version > SPA_VERSION) {
+       if (!SPA_VERSION_IS_SUPPORTED(version)) {
                (void) fprintf(stderr, gettext("cannot import '%s': pool "
-                   "is formatted using a newer ZFS version\n"), name);
+                   "is formatted using an unsupported ZFS version\n"), name);
                return (1);
        } else if (state != POOL_STATE_EXPORTED &&
            !(flags & ZFS_IMPORT_ANY_HOST)) {
@@ -2556,15 +2681,13 @@ static void
 print_header(list_cbdata_t *cb)
 {
        zprop_list_t *pl = cb->cb_proplist;
+       char headerbuf[ZPOOL_MAXPROPLEN];
        const char *header;
        boolean_t first = B_TRUE;
        boolean_t right_justify;
        size_t width = 0;
 
        for (; pl != NULL; pl = pl->pl_next) {
-               if (pl->pl_prop == ZPROP_INVAL)
-                       continue;
-
                width = pl->pl_width;
                if (first && cb->cb_verbose) {
                        /*
@@ -2579,8 +2702,26 @@ print_header(list_cbdata_t *cb)
                else
                        first = B_FALSE;
 
-               header = zpool_prop_column_name(pl->pl_prop);
-               right_justify = zpool_prop_align_right(pl->pl_prop);
+               right_justify = B_FALSE;
+               if (pl->pl_prop != ZPROP_INVAL) {
+                       header = zpool_prop_column_name(pl->pl_prop);
+                       right_justify = zpool_prop_align_right(pl->pl_prop);
+               } else {
+                       const char *name = pl->pl_user_prop;
+                       size_t i;
+
+                       /*
+                        * No zpool_prop_t to take a column name from, so
+                        * print the user property name in upper case.  The
+                        * copy is bounded by headerbuf, and the cast keeps
+                        * a negative char out of toupper().
+                        */
+                       for (i = 0; i < sizeof (headerbuf) - 1 &&
+                           name[i] != '\0'; i++)
+                               headerbuf[i] = toupper((unsigned char)name[i]);
+                       headerbuf[i] = '\0';
+                       header = headerbuf;
+               }
 
                if (pl->pl_next == NULL && !right_justify)
                        (void) printf("%s", header);
@@ -2639,6 +2772,11 @@ print_pool(zpool_handle_t *zhp, list_cbdata_t *cb)
                                propstr = property;
 
                        right_justify = zpool_prop_align_right(pl->pl_prop);
+               } else if ((zpool_prop_feature(pl->pl_user_prop) ||
+                   zpool_prop_unsupported(pl->pl_user_prop)) &&
+                   zpool_prop_get_feature(zhp, pl->pl_user_prop, property,
+                   sizeof (property)) == 0) {
+                       propstr = property;
                } else {
                        propstr = "-";
                }
@@ -3941,12 +4079,13 @@ status_callback(zpool_handle_t *zhp, void *data)
                break;
 
        case ZPOOL_STATUS_VERSION_OLDER:
-               (void) printf(gettext("status: The pool is formatted using an "
-                   "older on-disk format.  The pool can\n\tstill be used, but "
-                   "some features are unavailable.\n"));
+               (void) printf(gettext("status: The pool is formatted using a "
+                   "legacy on-disk format.  The pool can\n\tstill be used, "
+                   "but some features are unavailable.\n"));
                (void) printf(gettext("action: Upgrade the pool using 'zpool "
                    "upgrade'.  Once this is done, the\n\tpool will no longer "
-                   "be accessible on older software versions.\n"));
+                   "be accessible on software that does not support feature\n"
+                   "\tflags.\n"));
                break;
 
        case ZPOOL_STATUS_VERSION_NEWER:
@@ -3958,6 +4097,41 @@ status_callback(zpool_handle_t *zhp, void *data)
                    "backup.\n"));
                break;
 
+       case ZPOOL_STATUS_FEAT_DISABLED:
+               (void) printf(gettext("status: Some supported features are not "
+                   "enabled on the pool. The pool can\n\tstill be used, but "
+                   "some features are unavailable.\n"));
+               (void) printf(gettext("action: Enable all features using "
+                   "'zpool upgrade'. Once this is done,\n\tthe pool may no "
+                   "longer be accessible by software that does not support\n\t"
+                   "the features. See zpool-features(5) for details.\n"));
+               break;
+
+       case ZPOOL_STATUS_UNSUP_FEAT_READ:
+               (void) printf(gettext("status: The pool cannot be accessed on "
+                   "this system because it uses the\n\tfollowing feature(s) "
+                   "not supported on this system:\n"));
+               zpool_print_unsup_feat(config);
+               (void) printf("\n");
+               (void) printf(gettext("action: Access the pool from a system "
+                   "that supports the required feature(s),\n\tor restore the "
+                   "pool from backup.\n"));
+               break;
+
+       case ZPOOL_STATUS_UNSUP_FEAT_WRITE:
+               (void) printf(gettext("status: The pool can only be accessed "
+                   "in read-only mode on this system. It\n\tcannot be "
+                   "accessed in read-write mode because it uses the "
+                   "following\n\tfeature(s) not supported on this system:\n"));
+               zpool_print_unsup_feat(config);
+               (void) printf("\n");
+               (void) printf(gettext("action: The pool cannot be accessed in "
+                   "read-write mode. Import the pool with\n"
+                   "\t\"-o readonly=on\", access the pool from a system that "
+                   "supports the\n\trequired feature(s), or restore the "
+                   "pool from backup.\n"));
+               break;
+
        case ZPOOL_STATUS_FAULTED_DEV_R:
                (void) printf(gettext("status: One or more devices are "
                    "faulted in response to persistent errors.\n\tSufficient "
@@ -4162,56 +4336,162 @@ zpool_do_status(int argc, char **argv)
 }
 
 typedef struct upgrade_cbdata {
-       int     cb_all;
        int     cb_first;
-       int     cb_newer;
        int     cb_argc;
        uint64_t cb_version;
        char    **cb_argv;
 } upgrade_cbdata_t;
 
+/* Upgrade 'zhp' to 'version', then print the old and new versions. */
 static int
+upgrade_version(zpool_handle_t *zhp, uint64_t version)
+{
+       uint64_t from;
+       nvlist_t *cfg = zpool_get_config(zhp, NULL);
+       int err;
+
+       verify(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_VERSION, &from) == 0);
+
+       /* Only a supported, strictly older starting version is expected. */
+       assert(SPA_VERSION_IS_SUPPORTED(from));
+       assert(from < version);
+
+       if ((err = zpool_upgrade(zhp, version)) != 0)
+               return (err);
+
+       /* SPA_VERSION_FEATURES and up is reported as "feature flags". */
+       if (version < SPA_VERSION_FEATURES) {
+               (void) printf(gettext("Successfully upgraded "
+                   "'%s' from version %llu to version %llu.\n"),
+                   zpool_get_name(zhp), (u_longlong_t) from,
+                   (u_longlong_t) version);
+       } else {
+               (void) printf(gettext("Successfully upgraded "
+                   "'%s' from version %llu to feature flags.\n"),
+                   zpool_get_name(zhp), (u_longlong_t) from);
+       }
+
+       return (0);
+}
+
+/* Enable every feature 'zhp' lacks; the tally goes to *countp on success. */
+static int
+upgrade_enable_all(zpool_handle_t *zhp, int *countp)
+{
+       nvlist_t *have = zpool_get_features(zhp);
+       boolean_t banner_shown = B_FALSE;
+       int idx, err, nenabled = 0;
+
+       for (idx = 0; idx < SPA_FEATURES; idx++) {
+               const char *uname = spa_feature_table[idx].fi_uname;
+               char *prop;
+
+               /* Already listed in the pool's feature nvlist. */
+               if (nvlist_exists(have, spa_feature_table[idx].fi_guid))
+                       continue;
+
+               verify(-1 != asprintf(&prop, "feature@%s", uname));
+               err = zpool_set_prop(zhp, prop, ZFS_FEATURE_ENABLED);
+               free(prop);
+               if (err != 0)
+                       return (err);   /* *countp is left untouched */
+               nenabled++;
+
+               /* One banner per pool, ahead of its first feature. */
+               if (!banner_shown) {
+                       (void) printf(gettext("Enabled the "
+                           "following features on '%s':\n"),
+                           zpool_get_name(zhp));
+                       banner_shown = B_TRUE;
+               }
+               (void) printf(gettext("  %s\n"), uname);
+       }
+
+       if (countp != NULL)
+               *countp = nenabled;
+       return (0);
+}
+
+static int /* zpool_iter() callback behind 'zpool upgrade -a' */
 upgrade_cb(zpool_handle_t *zhp, void *arg)
 {
        upgrade_cbdata_t *cbp = arg;
        nvlist_t *config;
        uint64_t version;
-       int ret = 0;
+       boolean_t printnl = B_FALSE; /* blank line owed after output */
+       int ret;
 
        config = zpool_get_config(zhp, NULL);
        verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
            &version) == 0);
 
-       if (!cbp->cb_newer && version < SPA_VERSION) {
-               if (!cbp->cb_all) {
-                       if (cbp->cb_first) {
-                               (void) printf(gettext("The following pools are "
-                                   "out of date, and can be upgraded.  After "
-                                   "being\nupgraded, these pools will no "
-                                   "longer be accessible by older software "
-                                   "versions.\n\n"));
-                               (void) printf(gettext("VER  POOL\n"));
-                               (void) printf(gettext("---  ------------\n"));
-                               cbp->cb_first = B_FALSE;
-                       }
+       assert(SPA_VERSION_IS_SUPPORTED(version));
 
-                       (void) printf("%2llu   %s\n", (u_longlong_t)version,
-                           zpool_get_name(zhp));
-               } else {
+       if (version < cbp->cb_version) { /* pool is older than the target */
+               cbp->cb_first = B_FALSE; /* a pool needed upgrading */
+               ret = upgrade_version(zhp, cbp->cb_version);
+               if (ret != 0)
+                       return (ret);
+               printnl = B_TRUE;
+
+#if 0
+               /*
+                * XXX: This code can be enabled when Illumos commit
+                * 4445fffbbb1ea25fd0e9ea68b9380dd7a6709025 is merged.
+                * It reworks the history logging among other things.
+                */
+
+               /*
+                * If they did "zpool upgrade -a", then we could
+                * be doing ioctls to different pools.  We need
+                * to log this history once to each pool, and bypass
+                * the normal history logging that happens in main().
+                */
+               (void) zpool_log_history(g_zfs, history_str);
+               log_history = B_FALSE;
+#endif
+       }
+
+       if (cbp->cb_version >= SPA_VERSION_FEATURES) {
+               int count; /* features enabled by the call below */
+               ret = upgrade_enable_all(zhp, &count);
+               if (ret != 0)
+                       return (ret);
+
+               if (count > 0) {
                        cbp->cb_first = B_FALSE;
-                       ret = zpool_upgrade(zhp, cbp->cb_version);
-                       if (!ret) {
-                               (void) printf(gettext("Successfully upgraded "
-                                   "'%s'\n\n"), zpool_get_name(zhp));
-                       }
+                       printnl = B_TRUE;
                }
-       } else if (cbp->cb_newer && version > SPA_VERSION) {
-               assert(!cbp->cb_all);
+       }
+
+       if (printnl) {
+               (void) printf(gettext("\n"));
+       }
+
+       return (0);
+}
+
+static int
+upgrade_list_older_cb(zpool_handle_t *zhp, void *arg)
+{
+       upgrade_cbdata_t *cbp = arg;
+       nvlist_t *config;
+       uint64_t version;
+
+       config = zpool_get_config(zhp, NULL);
+       verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+           &version) == 0);
+
+       assert(SPA_VERSION_IS_SUPPORTED(version));
 
+       if (version < SPA_VERSION_FEATURES) {
                if (cbp->cb_first) {
                        (void) printf(gettext("The following pools are "
-                           "formatted using a newer software version and\n"
-                           "cannot be accessed on the current system.\n\n"));
+                           "formatted with legacy version numbers and can\n"
+                           "be upgraded to use feature flags.  After "
+                           "being upgraded, these pools\nwill no "
+                           "longer be accessible by software that does not "
+                           "support feature\nflags.\n\n"));
                        (void) printf(gettext("VER  POOL\n"));
                        (void) printf(gettext("---  ------------\n"));
                        cbp->cb_first = B_FALSE;
@@ -4221,14 +4501,65 @@ upgrade_cb(zpool_handle_t *zhp, void *arg)
                    zpool_get_name(zhp));
        }
 
-       zpool_close(zhp);
-       return (ret);
+       return (0);
+}
+
+/*
+ * zpool_iter() callback: for a feature-flags pool, print each supported
+ * feature that is not yet in the pool's feature list.
+ */
+static int
+upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg)
+{
+       upgrade_cbdata_t *cbp = arg;
+       boolean_t name_shown = B_FALSE;
+       nvlist_t *cfg, *have;
+       uint64_t version;
+       int idx;
+
+       cfg = zpool_get_config(zhp, NULL);
+       verify(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_VERSION,
+           &version) == 0);
+
+       /* Only pools at SPA_VERSION_FEATURES or later are examined. */
+       if (version < SPA_VERSION_FEATURES)
+               return (0);
+
+       have = zpool_get_features(zhp);
+       for (idx = 0; idx < SPA_FEATURES; idx++) {
+               if (nvlist_exists(have, spa_feature_table[idx].fi_guid))
+                       continue;
+
+               /* Explanation and column header, once per listing. */
+               if (cbp->cb_first) {
+                       (void) printf(gettext("\nSome supported features "
+                           "are not enabled on the following pools. "
+                           "Once a\nfeature is enabled the pool may "
+                           "become incompatible with software\nthat "
+                           "does not support the feature. See "
+                           "zpool-features(5) for details.\n\n"));
+                       (void) printf(gettext("POOL  FEATURE\n"));
+                       (void) printf(gettext("---------------\n"));
+                       cbp->cb_first = B_FALSE;
+               }
+
+               /* Pool name once, with its missing features below it. */
+               if (!name_shown) {
+                       (void) printf(gettext("%s\n"), zpool_get_name(zhp));
+                       name_shown = B_TRUE;
+               }
+               (void) printf(gettext("      %s\n"),
+                   spa_feature_table[idx].fi_uname);
+       }
+
+       return (0);
 }
 
 /* ARGSUSED */
 static int
 upgrade_one(zpool_handle_t *zhp, void *data)
 {
+       boolean_t printnl = B_FALSE;
        upgrade_cbdata_t *cbp = data;
        uint64_t cur_version;
        int ret;
@@ -4243,26 +4574,45 @@ upgrade_one(zpool_handle_t *zhp, void *data)
        cur_version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
        if (cur_version > cbp->cb_version) {
                (void) printf(gettext("Pool '%s' is already formatted "
-                   "using more current version '%llu'.\n"),
+                   "using more current version '%llu'.\n\n"),
                    zpool_get_name(zhp), (u_longlong_t) cur_version);
                return (0);
        }
-       if (cur_version == cbp->cb_version) {
+
+       if (cbp->cb_version != SPA_VERSION && cur_version == cbp->cb_version) {
                (void) printf(gettext("Pool '%s' is already formatted "
-                   "using the current version.\n"), zpool_get_name(zhp));
+                   "using version %llu.\n\n"), zpool_get_name(zhp),
+                   (u_longlong_t) cbp->cb_version);
                return (0);
        }
 
-       ret = zpool_upgrade(zhp, cbp->cb_version);
+       if (cur_version != cbp->cb_version) {
+               printnl = B_TRUE;
+               ret = upgrade_version(zhp, cbp->cb_version);
+               if (ret != 0)
+                       return (ret);
+       }
 
-       if (!ret) {
-               (void) printf(gettext("Successfully upgraded '%s' "
-                   "from version %llu to version %llu\n\n"),
-                   zpool_get_name(zhp), (u_longlong_t)cur_version,
-                   (u_longlong_t)cbp->cb_version);
+       if (cbp->cb_version >= SPA_VERSION_FEATURES) {
+               int count = 0;
+               ret = upgrade_enable_all(zhp, &count);
+               if (ret != 0)
+                       return (ret);
+
+               if (count != 0) {
+                       printnl = B_TRUE;
+               } else if (cur_version == SPA_VERSION) {
+                       (void) printf(gettext("Pool '%s' already has all "
+                           "supported features enabled.\n"),
+                           zpool_get_name(zhp));
+               }
        }
 
-       return (ret != 0);
+       if (printnl) {
+               (void) printf(gettext("\n"));
+       }
+
+       return (0);
 }
 
 /*
@@ -4281,6 +4631,7 @@ zpool_do_upgrade(int argc, char **argv)
        upgrade_cbdata_t cb = { 0 };
        int ret = 0;
        boolean_t showversions = B_FALSE;
+       boolean_t upgradeall = B_FALSE;
        char *end;
 
 
@@ -4288,15 +4639,15 @@ zpool_do_upgrade(int argc, char **argv)
        while ((c = getopt(argc, argv, ":avV:")) != -1) {
                switch (c) {
                case 'a':
-                       cb.cb_all = B_TRUE;
+                       upgradeall = B_TRUE;
                        break;
                case 'v':
                        showversions = B_TRUE;
                        break;
                case 'V':
                        cb.cb_version = strtoll(optarg, &end, 10);
-                       if (*end != '\0' || cb.cb_version > SPA_VERSION ||
-                           cb.cb_version < SPA_VERSION_1) {
+                       if (*end != '\0' ||
+                           !SPA_VERSION_IS_SUPPORTED(cb.cb_version)) {
                                (void) fprintf(stderr,
                                    gettext("invalid version '%s'\n"), optarg);
                                usage(B_FALSE);
@@ -4321,19 +4672,19 @@ zpool_do_upgrade(int argc, char **argv)
 
        if (cb.cb_version == 0) {
                cb.cb_version = SPA_VERSION;
-       } else if (!cb.cb_all && argc == 0) {
+       } else if (!upgradeall && argc == 0) {
                (void) fprintf(stderr, gettext("-V option is "
                    "incompatible with other arguments\n"));
                usage(B_FALSE);
        }
 
        if (showversions) {
-               if (cb.cb_all || argc != 0) {
+               if (upgradeall || argc != 0) {
                        (void) fprintf(stderr, gettext("-v option is "
                            "incompatible with other arguments\n"));
                        usage(B_FALSE);
                }
-       } else if (cb.cb_all) {
+       } else if (upgradeall) {
                if (argc != 0) {
                        (void) fprintf(stderr, gettext("-a option should not "
                            "be used along with a pool name\n"));
@@ -4341,11 +4692,27 @@ zpool_do_upgrade(int argc, char **argv)
                }
        }
 
-       (void) printf(gettext("This system is currently running "
-           "ZFS pool version %llu.\n\n"), SPA_VERSION);
-       cb.cb_first = B_TRUE;
+       (void) printf(gettext("This system supports ZFS pool feature "
+           "flags.\n\n"));
        if (showversions) {
-               (void) printf(gettext("The following versions are "
+               int i;
+
+               (void) printf(gettext("The following features are "
+                   "supported:\n\n"));
+               (void) printf(gettext("FEAT DESCRIPTION\n"));
+               (void) printf("----------------------------------------------"
+                   "---------------\n");
+               for (i = 0; i < SPA_FEATURES; i++) {
+                       zfeature_info_t *fi = &spa_feature_table[i];
+                       const char *ro = fi->fi_can_readonly ?
+                           " (read-only compatible)" : "";
+
+                       (void) printf("%-37s%s\n", fi->fi_uname, ro);
+                       (void) printf("     %s\n", fi->fi_desc);
+               }
+               (void) printf("\n");
+
+               (void) printf(gettext("The following legacy versions are also "
                    "supported:\n\n"));
                (void) printf(gettext("VER  DESCRIPTION\n"));
                (void) printf("---  -----------------------------------------"
@@ -4388,32 +4755,44 @@ zpool_do_upgrade(int argc, char **argv)
                (void) printf(gettext("\nFor more information on a particular "
                    "version, including supported releases,\n"));
                (void) printf(gettext("see the ZFS Administration Guide.\n\n"));
-       } else if (argc == 0) {
-               int notfound;
-
+       } else if (argc == 0 && upgradeall) {
+               cb.cb_first = B_TRUE;
                ret = zpool_iter(g_zfs, upgrade_cb, &cb);
-               notfound = cb.cb_first;
-
-               if (!cb.cb_all && ret == 0) {
-                       if (!cb.cb_first)
-                               (void) printf("\n");
-                       cb.cb_first = B_TRUE;
-                       cb.cb_newer = B_TRUE;
-                       ret = zpool_iter(g_zfs, upgrade_cb, &cb);
-                       if (!cb.cb_first) {
-                               notfound = B_FALSE;
-                               (void) printf("\n");
+               if (ret == 0 && cb.cb_first) {
+                       if (cb.cb_version == SPA_VERSION) {
+                               (void) printf(gettext("All pools are already "
+                                   "formatted using feature flags.\n\n"));
+                               (void) printf(gettext("Every feature flags "
+                                   "pool already has all supported features "
+                                   "enabled.\n"));
+                       } else {
+                               (void) printf(gettext("All pools are already "
+                                   "formatted with version %llu or higher.\n"),
+                                   (u_longlong_t) cb.cb_version);
                        }
                }
+       } else if (argc == 0) {
+               cb.cb_first = B_TRUE;
+               ret = zpool_iter(g_zfs, upgrade_list_older_cb, &cb);
+               assert(ret == 0);
+
+               if (cb.cb_first) {
+                       (void) printf(gettext("All pools are formatted "
+                           "using feature flags.\n\n"));
+               } else {
+                       (void) printf(gettext("\nUse 'zpool upgrade -v' "
+                           "for a list of available legacy versions.\n"));
+               }
+
+               cb.cb_first = B_TRUE;
+               ret = zpool_iter(g_zfs, upgrade_list_disabled_cb, &cb);
+               assert(ret == 0);
 
-               if (ret == 0) {
-                       if (notfound)
-                               (void) printf(gettext("All pools are formatted "
-                                   "using this version.\n"));
-                       else if (!cb.cb_all)
-                               (void) printf(gettext("Use 'zpool upgrade -v' "
-                                   "for a list of available versions and "
-                                   "their associated\nfeatures.\n"));
+               if (cb.cb_first) {
+                       (void) printf(gettext("Every feature flags pool has "
+                           "all supported features enabled.\n"));
+               } else {
+                       (void) printf(gettext("\n"));
                }
        } else {
                ret = for_each_pool(argc, argv, B_FALSE, NULL,
@@ -4923,13 +5302,26 @@ get_callback(zpool_handle_t *zhp, void *data)
                    pl == cbp->cb_proplist)
                        continue;
 
-               if (zpool_get_prop(zhp, pl->pl_prop,
-                   value, sizeof (value), &srctype) != 0)
-                       continue;
+               if (pl->pl_prop == ZPROP_INVAL &&
+                   (zpool_prop_feature(pl->pl_user_prop) ||
+                   zpool_prop_unsupported(pl->pl_user_prop))) {
+                       srctype = ZPROP_SRC_LOCAL;
+
+                       if (zpool_prop_get_feature(zhp, pl->pl_user_prop,
+                           value, sizeof (value)) == 0) {
+                               zprop_print_one_property(zpool_get_name(zhp),
+                                   cbp, pl->pl_user_prop, value, srctype,
+                                   NULL, NULL);
+                       }
+               } else {
+                       if (zpool_get_prop(zhp, pl->pl_prop, value,
+                           sizeof (value), &srctype) != 0)
+                               continue;
 
-               zprop_print_one_property(zpool_get_name(zhp), cbp,
-                   zpool_prop_to_name(pl->pl_prop), value, srctype, NULL,
-                   NULL);
+                       zprop_print_one_property(zpool_get_name(zhp), cbp,
+                           zpool_prop_to_name(pl->pl_prop), value, srctype,
+                           NULL, NULL);
+               }
        }
        return (0);
 }
@@ -4941,8 +5333,11 @@ zpool_do_get(int argc, char **argv)
        zprop_list_t fake_name = { 0 };
        int ret;
 
-       if (argc < 3)
+       if (argc < 2) {
+               (void) fprintf(stderr, gettext("missing property "
+                   "argument\n"));
                usage(B_FALSE);
+       }
 
        cb.cb_first = B_TRUE;
        cb.cb_sources = ZPROP_SRC_ALL;
@@ -4952,7 +5347,7 @@ zpool_do_get(int argc, char **argv)
        cb.cb_columns[3] = GET_COL_SOURCE;
        cb.cb_type = ZFS_TYPE_POOL;
 
-       if (zprop_get_list(g_zfs, argv[1],  &cb.cb_proplist,
+       if (zprop_get_list(g_zfs, argv[1], &cb.cb_proplist,
            ZFS_TYPE_POOL) != 0)
                usage(B_FALSE);
 
index 7e941b5..07b81cc 100644 (file)
 #include <sys/dsl_scan.h>
 #include <sys/zio_checksum.h>
 #include <sys/refcount.h>
+#include <sys/zfeature.h>
 #include <stdio.h>
 #include <stdio_ext.h>
 #include <stdlib.h>
@@ -330,6 +331,7 @@ ztest_func_t ztest_vdev_add_remove;
 ztest_func_t ztest_vdev_aux_add_remove;
 ztest_func_t ztest_split_pool;
 ztest_func_t ztest_reguid;
+ztest_func_t ztest_spa_upgrade;
 
 uint64_t zopt_always = 0ULL * NANOSEC;         /* all the time */
 uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */
@@ -360,17 +362,12 @@ ztest_info_t ztest_info[] = {
        { ztest_fault_inject,                   1,      &zopt_sometimes },
        { ztest_ddt_repair,                     1,      &zopt_sometimes },
        { ztest_dmu_snapshot_hold,              1,      &zopt_sometimes },
-       /*
-        * The reguid test is currently broken. Disable it until
-        * we get around to fixing it.
-        */
-#if 0
        { ztest_reguid,                         1,      &zopt_sometimes },
-#endif
        { ztest_spa_rename,                     1,      &zopt_rarely    },
        { ztest_scrub,                          1,      &zopt_rarely    },
+       { ztest_spa_upgrade,                    1,      &zopt_rarely    },
        { ztest_dsl_dataset_promote_busy,       1,      &zopt_rarely    },
-       { ztest_vdev_attach_detach,             1,      &zopt_rarely },
+       { ztest_vdev_attach_detach,             1,      &zopt_rarely    },
        { ztest_vdev_LUN_growth,                1,      &zopt_rarely    },
        { ztest_vdev_add_remove,                1,
            &ztest_opts.zo_vdevtime                             },
@@ -421,6 +418,13 @@ static spa_t *ztest_spa = NULL;
 static ztest_ds_t *ztest_ds;
 
 static kmutex_t ztest_vdev_lock;
+
+/*
+ * The ztest_name_lock protects the pool and dataset namespace used by
+ * the individual tests. To modify the namespace, consumers must grab
+ * this lock as writer. Grabbing the lock as reader will ensure that the
+ * namespace does not change while the lock is held.
+ */
 static krwlock_t ztest_name_lock;
 
 static boolean_t ztest_dump_core = B_TRUE;
@@ -816,7 +820,7 @@ ztest_get_ashift(void)
 }
 
 static nvlist_t *
-make_vdev_file(char *path, char *aux, size_t size, uint64_t ashift)
+make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift)
 {
        char *pathbuf;
        uint64_t vdev;
@@ -834,12 +838,13 @@ make_vdev_file(char *path, char *aux, size_t size, uint64_t ashift)
                        vdev = ztest_shared->zs_vdev_aux;
                        (void) snprintf(path, MAXPATHLEN,
                            ztest_aux_template, ztest_opts.zo_dir,
-                           ztest_opts.zo_pool, aux, vdev);
+                           pool == NULL ? ztest_opts.zo_pool : pool,
+                           aux, vdev);
                } else {
                        vdev = ztest_shared->zs_vdev_next_leaf++;
                        (void) snprintf(path, MAXPATHLEN,
                            ztest_dev_template, ztest_opts.zo_dir,
-                           ztest_opts.zo_pool, vdev);
+                           pool == NULL ? ztest_opts.zo_pool : pool, vdev);
                }
        }
 
@@ -862,17 +867,18 @@ make_vdev_file(char *path, char *aux, size_t size, uint64_t ashift)
 }
 
 static nvlist_t *
-make_vdev_raidz(char *path, char *aux, size_t size, uint64_t ashift, int r)
+make_vdev_raidz(char *path, char *aux, char *pool, size_t size,
+    uint64_t ashift, int r)
 {
        nvlist_t *raidz, **child;
        int c;
 
        if (r < 2)
-               return (make_vdev_file(path, aux, size, ashift));
+               return (make_vdev_file(path, aux, pool, size, ashift));
        child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL);
 
        for (c = 0; c < r; c++)
-               child[c] = make_vdev_file(path, aux, size, ashift);
+               child[c] = make_vdev_file(path, aux, pool, size, ashift);
 
        VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0);
        VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE,
@@ -891,19 +897,19 @@ make_vdev_raidz(char *path, char *aux, size_t size, uint64_t ashift, int r)
 }
 
 static nvlist_t *
-make_vdev_mirror(char *path, char *aux, size_t size, uint64_t ashift,
-       int r, int m)
+make_vdev_mirror(char *path, char *aux, char *pool, size_t size,
+    uint64_t ashift, int r, int m)
 {
        nvlist_t *mirror, **child;
        int c;
 
        if (m < 1)
-               return (make_vdev_raidz(path, aux, size, ashift, r));
+               return (make_vdev_raidz(path, aux, pool, size, ashift, r));
 
        child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL);
 
        for (c = 0; c < m; c++)
-               child[c] = make_vdev_raidz(path, aux, size, ashift, r);
+               child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r);
 
        VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0);
        VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
@@ -920,8 +926,8 @@ make_vdev_mirror(char *path, char *aux, size_t size, uint64_t ashift,
 }
 
 static nvlist_t *
-make_vdev_root(char *path, char *aux, size_t size, uint64_t ashift,
-       int log, int r, int m, int t)
+make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift,
+    int log, int r, int m, int t)
 {
        nvlist_t *root, **child;
        int c;
@@ -931,7 +937,8 @@ make_vdev_root(char *path, char *aux, size_t size, uint64_t ashift,
        child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL);
 
        for (c = 0; c < t; c++) {
-               child[c] = make_vdev_mirror(path, aux, size, ashift, r, m);
+               child[c] = make_vdev_mirror(path, aux, pool, size, ashift,
+                   r, m);
                VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
                    log) == 0);
        }
@@ -949,6 +956,27 @@ make_vdev_root(char *path, char *aux, size_t size, uint64_t ashift,
        return (root);
 }
 
+/*
+ * Return a random spa version in [initial_version, SPA_VERSION_FEATURES].
+ * Versions above SPA_VERSION_BEFORE_FEATURES map to SPA_VERSION_FEATURES.
+ */
+static uint64_t
+ztest_random_spa_version(uint64_t initial_version)
+{
+       uint64_t version = initial_version;
+
+       if (version <= SPA_VERSION_BEFORE_FEATURES) {
+               version = version +
+                   ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1);
+       }
+
+       if (version > SPA_VERSION_BEFORE_FEATURES)
+               version = SPA_VERSION_FEATURES;
+
+       ASSERT(SPA_VERSION_IS_SUPPORTED(version));
+       return (version);
+}
+
 static int
 ztest_random_blocksize(void)
 {
@@ -2275,6 +2303,7 @@ ztest_zil_remount(ztest_ds_t *zd, uint64_t id)
 {
        objset_t *os = zd->zd_os;
 
+       mutex_enter(&zd->zd_dirobj_lock);
        (void) rw_enter(&zd->zd_zilog_lock, RW_WRITER);
 
        /* zfs_sb_teardown() */
@@ -2285,6 +2314,7 @@ ztest_zil_remount(ztest_ds_t *zd, uint64_t id)
        zil_replay(os, zd, ztest_replay_vector);
 
        (void) rw_exit(&zd->zd_zilog_lock);
+       mutex_exit(&zd->zd_dirobj_lock);
 }
 
 /*
@@ -2302,7 +2332,7 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
        /*
         * Attempt to create using a bad file.
         */
-       nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
+       nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
        VERIFY3U(ENOENT, ==,
            spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL));
        nvlist_free(nvroot);
@@ -2310,7 +2340,7 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
        /*
         * Attempt to create using a bad mirror.
         */
-       nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1);
+       nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 2, 1);
        VERIFY3U(ENOENT, ==,
            spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL));
        nvlist_free(nvroot);
@@ -2320,7 +2350,7 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
         * what's in the nvroot; we should fail with EEXIST.
         */
        (void) rw_enter(&ztest_name_lock, RW_READER);
-       nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
+       nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
        VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL));
        nvlist_free(nvroot);
        VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG));
@@ -2330,6 +2360,78 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
        (void) rw_exit(&ztest_name_lock);
 }
 
+/* ARGSUSED */
+void
+ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
+{
+       spa_t *spa;
+       uint64_t initial_version = SPA_VERSION_INITIAL;
+       uint64_t version, newversion;
+       nvlist_t *nvroot, *props;
+       char *name;
+
+       mutex_enter(&ztest_vdev_lock);
+       name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool);
+
+       /*
+        * Clean up from previous runs.
+        */
+       (void) spa_destroy(name);
+
+       nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0,
+           0, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1);
+
+       /*
+        * If we're configuring a RAIDZ device then make sure that the
+        * initial version is capable of supporting that feature.
+        */
+       switch (ztest_opts.zo_raidz_parity) {
+       case 0:
+       case 1:
+               initial_version = SPA_VERSION_INITIAL;
+               break;
+       case 2:
+               initial_version = SPA_VERSION_RAIDZ2;
+               break;
+       case 3:
+               initial_version = SPA_VERSION_RAIDZ3;
+               break;
+       }
+
+       /*
+        * Create a pool with a spa version that can be upgraded. Pick
+        * a value between initial_version and SPA_VERSION_BEFORE_FEATURES.
+        */
+       do {
+               version = ztest_random_spa_version(initial_version);
+       } while (version > SPA_VERSION_BEFORE_FEATURES);
+
+       props = fnvlist_alloc();
+       fnvlist_add_uint64(props,
+           zpool_prop_to_name(ZPOOL_PROP_VERSION), version);
+       VERIFY3S(spa_create(name, nvroot, props, NULL, NULL), ==, 0);
+       fnvlist_free(nvroot);
+       fnvlist_free(props);
+
+       VERIFY3S(spa_open(name, &spa, FTAG), ==, 0);
+       VERIFY3U(spa_version(spa), ==, version);
+       newversion = ztest_random_spa_version(version + 1);
+
+       if (ztest_opts.zo_verbose >= 4) {
+               (void) printf("upgrading spa version from %llu to %llu\n",
+                   (u_longlong_t)version, (u_longlong_t)newversion);
+       }
+
+       spa_upgrade(spa, newversion);
+       VERIFY3U(spa_version(spa), >, version);
+       VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config,
+           zpool_prop_to_name(ZPOOL_PROP_VERSION)));
+       spa_close(spa, FTAG);
+
+       strfree(name);
+       mutex_exit(&ztest_vdev_lock);
+}
+
 static vdev_t *
 vdev_lookup_by_path(vdev_t *vd, const char *path)
 {
@@ -2420,7 +2522,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
                /*
                 * Make 1/4 of the devices be log devices.
                 */
-               nvroot = make_vdev_root(NULL, NULL,
+               nvroot = make_vdev_root(NULL, NULL, NULL,
                    ztest_opts.zo_vdev_size, 0,
                    ztest_random(4) == 0, ztest_opts.zo_raidz,
                    zs->zs_mirrors, 1);
@@ -2499,7 +2601,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
                /*
                 * Add a new device.
                 */
-               nvlist_t *nvroot = make_vdev_root(NULL, aux,
+               nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL,
                    (ztest_opts.zo_vdev_size * 5) / 4, 0, 0, 0, 0, 1);
                error = spa_vdev_add(spa, nvroot);
                if (error != 0)
@@ -2772,7 +2874,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
        /*
         * Build the nvlist describing newpath.
         */
-       root = make_vdev_root(newpath, NULL, newvd == NULL ? newsize : 0,
+       root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0,
            ashift, 0, 0, 0, 1);
 
        error = spa_vdev_attach(spa, oldguid, root, replacing);
@@ -4863,7 +4965,18 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
                        if (islog)
                                (void) rw_exit(&ztest_name_lock);
                } else {
+                       /*
+                        * Ideally we would like to be able to randomly
+                        * call vdev_[on|off]line without holding locks
+                        * to force unpredictable failures but the side
+                        * effects of vdev_[on|off]line prevent us from
+                        * doing so. We grab the ztest_vdev_lock here to
+                        * prevent a race between injection testing and
+                        * aux_vdev removal.
+                        */
+                       mutex_enter(&ztest_vdev_lock);
                        (void) vdev_online(spa, guid0, 0, NULL);
+                       mutex_exit(&ztest_vdev_lock);
                }
        }
 
@@ -5039,13 +5152,19 @@ ztest_reguid(ztest_ds_t *zd, uint64_t id)
 {
        spa_t *spa = ztest_spa;
        uint64_t orig, load;
+       int error;
 
        orig = spa_guid(spa);
        load = spa_load_guid(spa);
-       if (spa_change_guid(spa) != 0)
+
+       (void) rw_enter(&ztest_name_lock, RW_WRITER);
+       error = spa_change_guid(spa);
+       (void) rw_exit(&ztest_name_lock);
+
+       if (error != 0)
                return;
 
-       if (ztest_opts.zo_verbose >= 3) {
+       if (ztest_opts.zo_verbose >= 4) {
                (void) printf("Changed guid old %llu -> %llu\n",
                    (u_longlong_t)orig, (u_longlong_t)spa_guid(spa));
        }
@@ -5735,8 +5854,15 @@ ztest_freeze(void)
         */
        kernel_init(FREAD | FWRITE);
        VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
+       ASSERT(spa_freeze_txg(spa) == UINT64_MAX);
        VERIFY3U(0, ==, ztest_dataset_open(0));
        ztest_dataset_close(0);
+
+       spa->spa_debug = B_TRUE;
+       ztest_spa = spa;
+       txg_wait_synced(spa_get_dsl(spa), 0);
+       ztest_reguid(NULL, 0);
+
        spa_close(spa, FTAG);
        kernel_fini();
 }
@@ -5771,10 +5897,9 @@ make_random_props(void)
 {
        nvlist_t *props;
 
-       if (ztest_random(2) == 0)
-               return (NULL);
-
        VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
+       if (ztest_random(2) == 0)
+               return (props);
        VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0);
 
        return (props);
@@ -5789,6 +5914,7 @@ ztest_init(ztest_shared_t *zs)
 {
        spa_t *spa;
        nvlist_t *nvroot, *props;
+       int i;
 
        mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
        rw_init(&ztest_name_lock, NULL, RW_DEFAULT, NULL);
@@ -5802,9 +5928,16 @@ ztest_init(ztest_shared_t *zs)
        ztest_shared->zs_vdev_next_leaf = 0;
        zs->zs_splits = 0;
        zs->zs_mirrors = ztest_opts.zo_mirrors;
-       nvroot = make_vdev_root(NULL, NULL, ztest_opts.zo_vdev_size, 0,
+       nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
            0, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
        props = make_random_props();
+       for (i = 0; i < SPA_FEATURES; i++) {
+               char *buf;
+               VERIFY3S(-1, !=, asprintf(&buf, "feature@%s",
+                   spa_feature_table[i].fi_uname));
+               VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0));
+               free(buf);
+       }
        VERIFY3U(0, ==, spa_create(ztest_opts.zo_pool, nvroot, props,
            NULL, NULL));
        nvlist_free(nvroot);
index 2b79f5f..fc38f2c 100644 (file)
@@ -89,6 +89,7 @@ AC_CONFIG_FILES([
        lib/libshare/Makefile
        cmd/Makefile
        cmd/zdb/Makefile
+       cmd/zhack/Makefile
        cmd/zfs/Makefile
        cmd/zinject/Makefile
        cmd/zpool/Makefile
index 8f9c8d7..8325a8e 100644 (file)
@@ -1,6 +1,7 @@
 SUBDIRS = linux sys
 
 COMMON_H = \
+       $(top_srcdir)/include/zfeature_common.h \
        $(top_srcdir)/include/zfs_comutil.h \
        $(top_srcdir)/include/zfs_deleg.h \
        $(top_srcdir)/include/zfs_fletcher.h \
index e59350c..08f3d9e 100644 (file)
@@ -301,6 +301,15 @@ typedef enum {
        ZPOOL_STATUS_BAD_LOG,           /* cannot read log chain(s) */
 
        /*
+        * If the pool has unsupported features but can still be opened in
+        * read-only mode, its status is ZPOOL_STATUS_UNSUP_FEAT_WRITE. If the
+        * pool has unsupported features but cannot be opened at all, its
+        * status is ZPOOL_STATUS_UNSUP_FEAT_READ.
+        */
+       ZPOOL_STATUS_UNSUP_FEAT_READ,   /* unsupported features for read */
+       ZPOOL_STATUS_UNSUP_FEAT_WRITE,  /* unsupported features for write */
+
+       /*
         * These faults have no corresponding message ID.  At the time we are
         * checking the status, the original reason for the FMA fault (I/O or
         * checksum errors) has been lost.
@@ -313,7 +322,8 @@ typedef enum {
         * requiring administrative attention.  There is no corresponding
         * message ID.
         */
-       ZPOOL_STATUS_VERSION_OLDER,     /* older on-disk version */
+       ZPOOL_STATUS_VERSION_OLDER,     /* older legacy on-disk version */
+       ZPOOL_STATUS_FEAT_DISABLED,     /* supported features are disabled */
        ZPOOL_STATUS_RESILVERING,       /* device being resilvered */
        ZPOOL_STATUS_OFFLINE_DEV,       /* device offline */
        ZPOOL_STATUS_REMOVED_DEV,       /* removed device */
@@ -332,6 +342,7 @@ extern void zpool_dump_ddt(const ddt_stat_t *dds, const ddt_histogram_t *ddh);
  * Statistics and configuration functions.
  */
 extern nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **);
+extern nvlist_t *zpool_get_features(zpool_handle_t *);
 extern int zpool_refresh_stats(zpool_handle_t *, boolean_t *);
 extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **);
 
@@ -344,6 +355,7 @@ extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *,
     char *altroot);
 extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *,
     nvlist_t *, int);
+extern void zpool_print_unsup_feat(nvlist_t *config);
 
 /*
  * Search for pools to import
@@ -435,6 +447,8 @@ extern int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname,
     uint64_t *propvalue);
 extern int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname,
     char *propbuf, int proplen, boolean_t literal);
+extern int zfs_prop_get_feature(zfs_handle_t *zhp, const char *propname,
+    char *buf, size_t len);
 extern int zfs_get_snapused_int(zfs_handle_t *firstsnap, zfs_handle_t *lastsnap,
     uint64_t *usedp);
 extern uint64_t getprop_uint64(zfs_handle_t *, zfs_prop_t, char **);
@@ -462,10 +476,19 @@ extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *);
 #define        ZFS_MOUNTPOINT_NONE     "none"
 #define        ZFS_MOUNTPOINT_LEGACY   "legacy"
 
+#define        ZFS_FEATURE_DISABLED    "disabled"
+#define        ZFS_FEATURE_ENABLED     "enabled"
+#define        ZFS_FEATURE_ACTIVE      "active"
+
+#define        ZFS_UNSUPPORTED_INACTIVE        "inactive"
+#define        ZFS_UNSUPPORTED_READONLY        "readonly"
+
 /*
  * zpool property management
  */
 extern int zpool_expand_proplist(zpool_handle_t *, zprop_list_t **);
+extern int zpool_prop_get_feature(zpool_handle_t *, const char *, char *,
+    size_t);
 extern const char *zpool_prop_default_string(zpool_prop_t);
 extern uint64_t zpool_prop_default_numeric(zpool_prop_t);
 extern const char *zpool_prop_column_name(zpool_prop_t);
index 651e68b..a0cc9d1 100644 (file)
@@ -6,6 +6,7 @@ COMMON_H = \
        $(top_srcdir)/include/sys/avl_impl.h \
        $(top_srcdir)/include/sys/bplist.h \
        $(top_srcdir)/include/sys/bpobj.h \
+       $(top_srcdir)/include/sys/bptree.h \
        $(top_srcdir)/include/sys/dbuf.h \
        $(top_srcdir)/include/sys/ddt.h \
        $(top_srcdir)/include/sys/dmu.h \
@@ -53,6 +54,7 @@ COMMON_H = \
        $(top_srcdir)/include/sys/zap.h \
        $(top_srcdir)/include/sys/zap_impl.h \
        $(top_srcdir)/include/sys/zap_leaf.h \
+       $(top_srcdir)/include/sys/zfeature.h \
        $(top_srcdir)/include/sys/zfs_acl.h \
        $(top_srcdir)/include/sys/zfs_context.h \
        $(top_srcdir)/include/sys/zfs_ctldir.h \
index 443597d..dbc91ea 100644 (file)
@@ -109,6 +109,7 @@ int arc_released(arc_buf_t *buf);
 int arc_has_callback(arc_buf_t *buf);
 void arc_buf_freeze(arc_buf_t *buf);
 void arc_buf_thaw(arc_buf_t *buf);
+boolean_t arc_buf_eviction_needed(arc_buf_t *buf);
 #ifdef ZFS_DEBUG
 int arc_referenced(arc_buf_t *buf);
 #endif
index 3771a95..af975c7 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef        _SYS_BPOBJ_H
@@ -67,7 +68,9 @@ typedef struct bpobj {
 typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
 
 uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx);
+uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx);
 void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
+void bpobj_decr_empty(objset_t *os, dmu_tx_t *tx);
 
 int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object);
 void bpobj_close(bpobj_t *bpo);
diff --git a/include/sys/bptree.h b/include/sys/bptree.h
new file mode 100644 (file)
index 0000000..9715072
--- /dev/null
@@ -0,0 +1,64 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#ifndef        _SYS_BPTREE_H
+#define        _SYS_BPTREE_H
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct bptree_phys {
+       uint64_t bt_begin;
+       uint64_t bt_end;
+       uint64_t bt_bytes;
+       uint64_t bt_comp;
+       uint64_t bt_uncomp;
+} bptree_phys_t;
+
+typedef struct bptree_entry_phys {
+       blkptr_t be_bp;
+       uint64_t be_birth_txg; /* only delete blocks born after this txg */
+       zbookmark_t be_zb; /* holds traversal resume point if needed */
+} bptree_entry_phys_t;
+
+typedef int bptree_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+
+uint64_t bptree_alloc(objset_t *os, dmu_tx_t *tx);
+int bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
+
+void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
+    uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx);
+
+int bptree_iterate(objset_t *os, uint64_t obj, boolean_t free,
+    bptree_itor_t func, void *arg, dmu_tx_t *tx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BPTREE_H */
index fe317c8..7fc876b 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  */
 
@@ -71,6 +71,53 @@ typedef struct objset objset_t;
 typedef struct dmu_tx dmu_tx_t;
 typedef struct dsl_dir dsl_dir_t;
 
+typedef enum dmu_object_byteswap {
+       DMU_BSWAP_UINT8,
+       DMU_BSWAP_UINT16,
+       DMU_BSWAP_UINT32,
+       DMU_BSWAP_UINT64,
+       DMU_BSWAP_ZAP,
+       DMU_BSWAP_DNODE,
+       DMU_BSWAP_OBJSET,
+       DMU_BSWAP_ZNODE,
+       DMU_BSWAP_OLDACL,
+       DMU_BSWAP_ACL,
+       /*
+        * Allocating a new byteswap type number makes the on-disk format
+        * incompatible with any other format that uses the same number.
+        *
+        * Data can usually be structured to work with one of the
+        * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types.
+        */
+       DMU_BSWAP_NUMFUNCS
+} dmu_object_byteswap_t;
+
+#define        DMU_OT_NEWTYPE 0x80
+#define        DMU_OT_METADATA 0x40
+#define        DMU_OT_BYTESWAP_MASK 0x3f
+
+/*
+ * Defines a uint8_t object type. Object types specify if the data
+ * in the object is metadata (boolean) and how to byteswap the data
+ * (dmu_object_byteswap_t).
+ */
+#define        DMU_OT(byteswap, metadata) \
+       (DMU_OT_NEWTYPE | \
+       ((metadata) ? DMU_OT_METADATA : 0) | \
+       ((byteswap) & DMU_OT_BYTESWAP_MASK))
+
+#define        DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \
+       ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \
+       (ot) < DMU_OT_NUMTYPES)
+
+#define        DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \
+       ((ot) & DMU_OT_METADATA) : \
+       dmu_ot[(int)(ot)].ot_metadata)
+
+#define        DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \
+       ((ot) & DMU_OT_BYTESWAP_MASK) : \
+       dmu_ot[(int)(ot)].ot_byteswap)
+
 typedef enum dmu_object_type {
        DMU_OT_NONE,
        /* general: */
@@ -135,7 +182,35 @@ typedef enum dmu_object_type {
        DMU_OT_DEADLIST_HDR,            /* UINT64 */
        DMU_OT_DSL_CLONES,              /* ZAP */
        DMU_OT_BPOBJ_SUBOBJ,            /* UINT64 */
-       DMU_OT_NUMTYPES
+       /*
+        * Do not allocate new object types here. Doing so makes the on-disk
+        * format incompatible with any other format that uses the same object
+        * type number.
+        *
+        * When creating an object which does not have one of the above types
+        * use the DMU_OTN_* type with the correct byteswap and metadata
+        * values.
+        *
+        * The DMU_OTN_* types do not have entries in the dmu_ot table,
+        * use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead
+        * of indexing into dmu_ot directly (this works for both DMU_OT_* types
+        * and DMU_OTN_* types).
+        */
+       DMU_OT_NUMTYPES,
+
+       /*
+        * Names for valid types declared with DMU_OT().
+        */
+       DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE),
+       DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE),
+       DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE),
+       DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE),
+       DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE),
+       DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE),
+       DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE),
+       DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE),
+       DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE),
+       DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE),
 } dmu_object_type_t;
 
 typedef enum dmu_objset_type {
@@ -215,6 +290,9 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
  */
 #define        DMU_POOL_DIRECTORY_OBJECT       1
 #define        DMU_POOL_CONFIG                 "config"
+#define        DMU_POOL_FEATURES_FOR_WRITE     "features_for_write"
+#define        DMU_POOL_FEATURES_FOR_READ      "features_for_read"
+#define        DMU_POOL_FEATURE_DESCRIPTIONS   "feature_descriptions"
 #define        DMU_POOL_ROOT_DATASET           "root_dataset"
 #define        DMU_POOL_SYNC_BPOBJ             "sync_bplist"
 #define        DMU_POOL_ERRLOG_SCRUB           "errlog_scrub"
@@ -230,6 +308,8 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
 #define        DMU_POOL_CREATION_VERSION       "creation_version"
 #define        DMU_POOL_SCAN                   "scan"
 #define        DMU_POOL_FREE_BPOBJ             "free_bpobj"
+#define        DMU_POOL_BPTREE_OBJ             "bptree_obj"
+#define        DMU_POOL_EMPTY_BPOBJ            "empty_bpobj"
 
 /*
  * Allocate an object from this objset.  The range of object numbers
@@ -490,7 +570,7 @@ void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
 
 /*
  * Free up the data blocks for a defined range of a file.  If size is
- * zero, the range from offset to end-of-file is freed.
+ * -1, the range from offset to end-of-file is freed.
  */
 int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
        uint64_t size, dmu_tx_t *tx);
@@ -564,12 +644,18 @@ typedef struct dmu_object_info {
 typedef void arc_byteswap_func_t(void *buf, size_t size);
 
 typedef struct dmu_object_type_info {
-       arc_byteswap_func_t     *ot_byteswap;
+       dmu_object_byteswap_t   ot_byteswap;
        boolean_t               ot_metadata;
        char                    *ot_name;
 } dmu_object_type_info_t;
 
+typedef struct dmu_object_byteswap_info {
+       arc_byteswap_func_t     *ob_func;
+       char                    *ob_name;
+} dmu_object_byteswap_info_t;
+
 extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
+extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];
 
 /*
  * Get information on a DMU object.
index 5b326cd..3cbf42f 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef        _SYS_DMU_TRAVERSE_H
@@ -54,6 +55,9 @@ typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 
 int traverse_dataset(struct dsl_dataset *ds,
     uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
+int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
+    uint64_t txg_start, zbookmark_t *resume, int flags,
+    blkptr_cb_t func, void *arg);
 int traverse_pool(spa_t *spa,
     uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
 
index 38ce3c5..547951c 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  */
 
@@ -86,7 +86,12 @@ typedef struct dsl_dataset_phys {
        uint64_t ds_creation_time;      /* seconds since 1970 */
        uint64_t ds_creation_txg;
        uint64_t ds_deadlist_obj;       /* DMU_OT_DEADLIST */
-       uint64_t ds_used_bytes;
+       /*
+        * ds_referenced_bytes, ds_compressed_bytes, and ds_uncompressed_bytes
+        * include all blocks referenced by this dataset, including those
+        * shared with any other datasets.
+        */
+       uint64_t ds_referenced_bytes;
        uint64_t ds_compressed_bytes;
        uint64_t ds_uncompressed_bytes;
        uint64_t ds_unique_bytes;       /* only relevant to snapshots */
index 40e9610..ff5df14 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef        _SYS_DSL_POOL_H
@@ -34,6 +35,7 @@
 #include <sys/ddt.h>
 #include <sys/arc.h>
 #include <sys/bpobj.h>
+#include <sys/bptree.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -48,7 +50,8 @@ struct dsl_scan;
 
 /* These macros are for indexing into the zfs_all_blkstats_t. */
 #define        DMU_OT_DEFERRED DMU_OT_NONE
-#define        DMU_OT_TOTAL    DMU_OT_NUMTYPES
+#define        DMU_OT_OTHER    DMU_OT_NUMTYPES /* place holder for DMU_OT() types */
+#define        DMU_OT_TOTAL    (DMU_OT_NUMTYPES + 1)
 
 typedef struct zfs_blkstat {
        uint64_t        zb_count;
@@ -87,12 +90,13 @@ typedef struct dsl_pool {
 
        /* No lock needed - sync context only */
        blkptr_t dp_meta_rootbp;
-       list_t dp_synced_datasets;
        hrtime_t dp_read_overhead;
        uint64_t dp_throughput; /* bytes per millisec */
        uint64_t dp_write_limit;
        uint64_t dp_tmp_userrefs_obj;
        bpobj_t dp_free_bpobj;
+       uint64_t dp_bptree_obj;
+       uint64_t dp_empty_bpobj;
 
        struct dsl_scan *dp_scan;
 
@@ -100,6 +104,9 @@ typedef struct dsl_pool {
        kmutex_t dp_lock;
        uint64_t dp_space_towrite[TXG_SIZE];
        uint64_t dp_tempreserved[TXG_SIZE];
+       uint64_t dp_mos_used_delta;
+       uint64_t dp_mos_compressed_delta;
+       uint64_t dp_mos_uncompressed_delta;
        uint64_t dp_txg_history_size;
        list_t dp_txg_history;
 
@@ -107,6 +114,7 @@ typedef struct dsl_pool {
        /* Has its own locking */
        tx_state_t dp_tx;
        txg_list_t dp_dirty_datasets;
+       txg_list_t dp_dirty_zilogs;
        txg_list_t dp_dirty_dirs;
        txg_list_t dp_sync_tasks;
 
@@ -121,7 +129,8 @@ typedef struct dsl_pool {
        zfs_all_blkstats_t *dp_blkstats;
 } dsl_pool_t;
 
-int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp);
+int dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp);
+int dsl_pool_open(dsl_pool_t *dp);
 void dsl_pool_close(dsl_pool_t *dp);
 dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg);
 void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
@@ -145,6 +154,8 @@ int dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
 void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx);
 void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
 void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx);
+void dsl_pool_mos_diduse_space(dsl_pool_t *dp,
+    int64_t used, int64_t comp, int64_t uncomp);
 
 taskq_t *dsl_pool_iput_taskq(dsl_pool_t *dp);
 
index c79666e..5691f4d 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef        _SYS_DSL_SCAN_H
@@ -79,6 +80,9 @@ typedef struct dsl_scan {
        uint64_t scn_sync_start_time;
        zio_t *scn_zio_root;
 
+       /* for freeing blocks */
+       boolean_t scn_is_bptree;
+
        /* for debugging / information */
        uint64_t scn_visited_this_txg;
 
index f72c74f..137dd39 100644 (file)
@@ -170,6 +170,7 @@ typedef enum {
        ZPOOL_PROP_ASHIFT,
        ZPOOL_PROP_COMMENT,
        ZPOOL_PROP_EXPANDSZ,
+       ZPOOL_PROP_FREEING,
        ZPOOL_NUM_PROPS
 } zpool_prop_t;
 
@@ -244,6 +245,8 @@ const char *zpool_prop_to_name(zpool_prop_t);
 const char *zpool_prop_default_string(zpool_prop_t);
 uint64_t zpool_prop_default_numeric(zpool_prop_t);
 boolean_t zpool_prop_readonly(zpool_prop_t);
+boolean_t zpool_prop_feature(const char *);
+boolean_t zpool_prop_unsupported(const char *);
 int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **);
 int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *);
 uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed);
@@ -356,6 +359,7 @@ typedef enum {
 #define        SPA_VERSION_26                  26ULL
 #define        SPA_VERSION_27                  27ULL
 #define        SPA_VERSION_28                  28ULL
+#define        SPA_VERSION_5000                5000ULL
 
 /*
  * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
@@ -363,8 +367,8 @@ typedef enum {
  * and do the appropriate changes.  Also bump the version number in
  * usr/src/grub/capability.
  */
-#define        SPA_VERSION                     SPA_VERSION_28
-#define        SPA_VERSION_STRING              "28"
+#define        SPA_VERSION                     SPA_VERSION_5000
+#define        SPA_VERSION_STRING              "5000"
 
 /*
  * Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -415,6 +419,12 @@ typedef enum {
 #define        SPA_VERSION_DEADLISTS           SPA_VERSION_26
 #define        SPA_VERSION_FAST_SNAP           SPA_VERSION_27
 #define        SPA_VERSION_MULTI_REPLACE       SPA_VERSION_28
+#define        SPA_VERSION_BEFORE_FEATURES     SPA_VERSION_28
+#define        SPA_VERSION_FEATURES            SPA_VERSION_5000
+
+#define        SPA_VERSION_IS_SUPPORTED(v) \
+       (((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \
+       ((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION))
 
 /*
  * ZPL version - rev'd whenever an incompatible on-disk format change
@@ -512,6 +522,12 @@ typedef struct zpool_rewind_policy {
 #define        ZPOOL_CONFIG_BOOTFS             "bootfs"        /* not stored on disk */
 #define        ZPOOL_CONFIG_MISSING_DEVICES    "missing_vdevs" /* not stored on disk */
 #define        ZPOOL_CONFIG_LOAD_INFO          "load_info"     /* not stored on disk */
+#define        ZPOOL_CONFIG_REWIND_INFO        "rewind_info"   /* not stored on disk */
+#define        ZPOOL_CONFIG_UNSUP_FEAT         "unsup_feat"    /* not stored on disk */
+#define        ZPOOL_CONFIG_ENABLED_FEAT       "enabled_feat"  /* not stored on disk */
+#define        ZPOOL_CONFIG_CAN_RDONLY         "can_rdonly"    /* not stored on disk */
+#define        ZPOOL_CONFIG_FEATURES_FOR_READ  "features_for_read"
+#define        ZPOOL_CONFIG_FEATURE_STATS      "feature_stats" /* not stored on disk */
 /*
  * The persistent vdev state is stored as separate values rather than a single
  * 'vdev_state' entry.  This is because a device can be in multiple states, such
@@ -590,6 +606,7 @@ typedef enum vdev_aux {
        VDEV_AUX_BAD_LABEL,     /* the label is OK but invalid          */
        VDEV_AUX_VERSION_NEWER, /* on-disk version is too new           */
        VDEV_AUX_VERSION_OLDER, /* on-disk version is too old           */
+       VDEV_AUX_UNSUP_FEAT,    /* unsupported features                 */
        VDEV_AUX_SPARED,        /* hot spare used in another pool       */
        VDEV_AUX_ERR_EXCEEDED,  /* too many errors                      */
        VDEV_AUX_IO_FAILURE,    /* experienced I/O failure              */
@@ -938,6 +955,7 @@ typedef enum history_internal_events {
        LOG_DS_USER_HOLD,
        LOG_DS_USER_RELEASE,
        LOG_POOL_SPLIT,
+       LOG_POOL_GUID_CHANGE,
        LOG_END
 } history_internal_events_t;
 
index 30ff4e0..cc399fd 100644 (file)
  */
 /*
  * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef        _SYS_NVPAIR_H
 #define        _SYS_NVPAIR_H
 
 #include <sys/types.h>
+#include <sys/time.h>
 #include <sys/errno.h>
 #include <sys/va_list.h>
 
@@ -274,6 +276,73 @@ int nvpair_value_hrtime(nvpair_t *, hrtime_t *);
 int nvpair_value_double(nvpair_t *, double *);
 #endif
 
+nvlist_t *fnvlist_alloc(void);
+void fnvlist_free(nvlist_t *);
+size_t fnvlist_size(nvlist_t *);
+char *fnvlist_pack(nvlist_t *, size_t *);
+void fnvlist_pack_free(char *, size_t);
+nvlist_t *fnvlist_unpack(char *, size_t);
+nvlist_t *fnvlist_dup(nvlist_t *);
+void fnvlist_merge(nvlist_t *, nvlist_t *);
+
+void fnvlist_add_boolean(nvlist_t *, const char *);
+void fnvlist_add_boolean_value(nvlist_t *, const char *, boolean_t);
+void fnvlist_add_byte(nvlist_t *, const char *, uchar_t);
+void fnvlist_add_int8(nvlist_t *, const char *, int8_t);
+void fnvlist_add_uint8(nvlist_t *, const char *, uint8_t);
+void fnvlist_add_int16(nvlist_t *, const char *, int16_t);
+void fnvlist_add_uint16(nvlist_t *, const char *, uint16_t);
+void fnvlist_add_int32(nvlist_t *, const char *, int32_t);
+void fnvlist_add_uint32(nvlist_t *, const char *, uint32_t);
+void fnvlist_add_int64(nvlist_t *, const char *, int64_t);
+void fnvlist_add_uint64(nvlist_t *, const char *, uint64_t);
+void fnvlist_add_string(nvlist_t *, const char *, const char *);
+void fnvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *);
+void fnvlist_add_nvpair(nvlist_t *, nvpair_t *);
+void fnvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, uint_t);
+void fnvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, uint_t);
+void fnvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t);
+void fnvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t);
+void fnvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t);
+void fnvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t);
+void fnvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t);
+void fnvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t);
+void fnvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t);
+void fnvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t);
+void fnvlist_add_string_array(nvlist_t *, const char *, char * const *, uint_t);
+void fnvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t);
+
+void fnvlist_remove(nvlist_t *, const char *);
+void fnvlist_remove_nvpair(nvlist_t *, nvpair_t *);
+
+nvpair_t *fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name);
+boolean_t fnvlist_lookup_boolean(nvlist_t *nvl, const char *name);
+boolean_t fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name);
+uchar_t fnvlist_lookup_byte(nvlist_t *nvl, const char *name);
+int8_t fnvlist_lookup_int8(nvlist_t *nvl, const char *name);
+int16_t fnvlist_lookup_int16(nvlist_t *nvl, const char *name);
+int32_t fnvlist_lookup_int32(nvlist_t *nvl, const char *name);
+int64_t fnvlist_lookup_int64(nvlist_t *nvl, const char *name);
+uint8_t fnvlist_lookup_uint8(nvlist_t *nvl, const char *name);
+uint16_t fnvlist_lookup_uint16(nvlist_t *nvl, const char *name);
+uint32_t fnvlist_lookup_uint32(nvlist_t *nvl, const char *name);
+uint64_t fnvlist_lookup_uint64(nvlist_t *nvl, const char *name);
+char *fnvlist_lookup_string(nvlist_t *nvl, const char *name);
+nvlist_t *fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name);
+
+boolean_t fnvpair_value_boolean_value(nvpair_t *nvp);
+uchar_t fnvpair_value_byte(nvpair_t *nvp);
+int8_t fnvpair_value_int8(nvpair_t *nvp);
+int16_t fnvpair_value_int16(nvpair_t *nvp);
+int32_t fnvpair_value_int32(nvpair_t *nvp);
+int64_t fnvpair_value_int64(nvpair_t *nvp);
+uint8_t fnvpair_value_uint8(nvpair_t *nvp);
+uint16_t fnvpair_value_uint16(nvpair_t *nvp);
+uint32_t fnvpair_value_uint32(nvpair_t *nvp);
+uint64_t fnvpair_value_uint64(nvpair_t *nvp);
+char *fnvpair_value_string(nvpair_t *nvp);
+nvlist_t *fnvpair_value_nvlist(nvpair_t *nvp);
+
 #ifdef __cplusplus
 }
 #endif
index 28bb4e1..8211722 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  */
 
@@ -94,7 +94,7 @@ struct dsl_pool;
 /*
  * Size of block to hold the configuration data (a packed nvlist)
  */
-#define        SPA_CONFIG_BLOCKSIZE    (1 << 14)
+#define        SPA_CONFIG_BLOCKSIZE    (1ULL << 14)
 
 /*
  * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
@@ -262,7 +262,7 @@ typedef struct blkptr {
                DVA_GET_ASIZE(&(bp)->blk_dva[2]))
 
 #define        BP_GET_UCSIZE(bp) \
-       ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \
+       ((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \
        BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
 
 #define        BP_GET_NDVAS(bp)        \
@@ -404,8 +404,8 @@ typedef struct blkptr {
 #include <sys/dmu.h>
 
 #define        BP_GET_BUFC_TYPE(bp)                                            \
-       (((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \
-       ARC_BUFC_METADATA : ARC_BUFC_DATA);
+       (((BP_GET_LEVEL(bp) > 0) || (DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))) ? \
+       ARC_BUFC_METADATA : ARC_BUFC_DATA)
 
 typedef enum spa_import_type {
        SPA_IMPORT_EXISTING,
@@ -416,8 +416,8 @@ typedef enum spa_import_type {
 extern int spa_open(const char *pool, spa_t **, void *tag);
 extern int spa_open_rewind(const char *pool, spa_t **, void *tag,
     nvlist_t *policy, nvlist_t **config);
-extern int spa_get_stats(const char *pool, nvlist_t **config,
-    char *altroot, size_t buflen);
+extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot,
+    size_t buflen);
 extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props,
     const char *history_str, nvlist_t *zplprops);
 extern int spa_import_rootpool(char *devpath, char *devid);
@@ -574,6 +574,7 @@ extern void spa_claim_notify(zio_t *zio);
 /* Accessor functions */
 extern boolean_t spa_shutting_down(spa_t *spa);
 extern struct dsl_pool *spa_get_dsl(spa_t *spa);
+extern boolean_t spa_is_initializing(spa_t *spa);
 extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
 extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
 extern void spa_altroot(spa_t *, char *, size_t);
@@ -605,6 +606,8 @@ extern uint64_t spa_delegation(spa_t *spa);
 extern objset_t *spa_meta_objset(spa_t *spa);
 
 /* Miscellaneous support routines */
+extern void spa_activate_mos_feature(spa_t *spa, const char *feature);
+extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature);
 extern int spa_rename(const char *oldname, const char *newname);
 extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid);
 extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
index d367486..65edc97 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  */
 
@@ -127,6 +127,7 @@ struct spa {
        uint64_t        spa_import_flags;       /* import specific flags */
        taskq_t         *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
        dsl_pool_t      *spa_dsl_pool;
+       boolean_t       spa_is_initializing;    /* true while opening pool */
        metaslab_class_t *spa_normal_class;     /* normal data class */
        metaslab_class_t *spa_log_class;        /* intent log data class */
        uint64_t        spa_first_txg;          /* first txg after spa_open() */
@@ -140,10 +141,12 @@ struct spa {
        vdev_t          *spa_root_vdev;         /* top-level vdev container */
        uint64_t        spa_config_guid;        /* config pool guid */
        uint64_t        spa_load_guid;          /* spa_load initialized guid */
+       uint64_t        spa_last_synced_guid;   /* last synced guid */
        list_t          spa_config_dirty_list;  /* vdevs with dirty config */
        list_t          spa_state_dirty_list;   /* vdevs with dirty state */
        spa_aux_vdev_t  spa_spares;             /* hot spares */
        spa_aux_vdev_t  spa_l2cache;            /* L2ARC cache devices */
+       nvlist_t        *spa_label_features;    /* Features for reading MOS */
        uint64_t        spa_config_object;      /* MOS object for pool config */
        uint64_t        spa_config_generation;  /* config generation number */
        uint64_t        spa_syncing_txg;        /* txg currently syncing */
@@ -220,7 +223,10 @@ struct spa {
        boolean_t       spa_autoreplace;        /* autoreplace set in open */
        int             spa_vdev_locks;         /* locks grabbed */
        uint64_t        spa_creation_version;   /* version at pool creation */
-       uint64_t        spa_prev_software_version;
+       uint64_t        spa_prev_software_version; /* See ub_software_version */
+       uint64_t        spa_feat_for_write_obj; /* required to write to pool */
+       uint64_t        spa_feat_for_read_obj;  /* required to read from pool */
+       uint64_t        spa_feat_desc_obj;      /* Feature descriptions */
        /*
         * spa_refcnt & spa_config_lock must be the last elements
         * because refcount_t changes size based on compilation options.
index 2f87d74..f9d6dd4 100644 (file)
@@ -22,6 +22,9 @@
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
 
 #ifndef _SYS_TXG_H
 #define        _SYS_TXG_H
@@ -121,7 +124,7 @@ extern void txg_wait_callbacks(struct dsl_pool *dp);
 
 extern void txg_list_create(txg_list_t *tl, size_t offset);
 extern void txg_list_destroy(txg_list_t *tl);
-extern int txg_list_empty(txg_list_t *tl, uint64_t txg);
+extern boolean_t txg_list_empty(txg_list_t *tl, uint64_t txg);
 extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg);
 extern int txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg);
 extern void *txg_list_remove(txg_list_t *tl, uint64_t txg);
index 0055783..8f297a9 100644 (file)
@@ -18,6 +18,7 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
@@ -140,8 +141,8 @@ extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
 struct uberblock;
 extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
 extern int vdev_label_number(uint64_t psise, uint64_t offset);
-extern nvlist_t *vdev_label_read_config(vdev_t *vd);
-extern void vdev_uberblock_load(zio_t *zio, vdev_t *vd, struct uberblock *ub);
+extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg);
+extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **);
 
 typedef enum {
        VDEV_LABEL_CREATE,      /* create/add a new device */
index 0b532dc..4133f2c 100644 (file)
@@ -210,7 +210,7 @@ struct vdev {
         * For DTrace to work in userland (libzpool) context, these fields must
         * remain at the end of the structure.  DTrace will use the kernel's
         * CTF definition for 'struct vdev', and since the size of a kmutex_t is
-        * larger in userland, the offsets for the rest fields would be
+        * larger in userland, the offsets for the rest of the fields would be
         * incorrect.
         */
        kmutex_t        vdev_dtl_lock;  /* vdev_dtl_{map,resilver}      */
@@ -265,6 +265,7 @@ typedef struct vdev_label {
 #define        VDEV_LABEL_START_SIZE   (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
 #define        VDEV_LABEL_END_SIZE     (2 * sizeof (vdev_label_t))
 #define        VDEV_LABELS             4
+#define        VDEV_BEST_LABEL         VDEV_LABELS
 
 #define        VDEV_ALLOC_LOAD         0
 #define        VDEV_ALLOC_ADD          1
index 6237f8b..092669c 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef        _SYS_ZAP_H
@@ -132,6 +133,8 @@ uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot,
 uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
     dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot,
+    uint64_t parent_obj, const char *name, dmu_tx_t *tx);
 
 /*
  * Create a new zapobj with no attributes from the given (unallocated)
@@ -297,13 +300,11 @@ int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
 /* Here the key is an int and the value is a different int. */
 int zap_add_int_key(objset_t *os, uint64_t obj,
     uint64_t key, uint64_t value, dmu_tx_t *tx);
+int zap_update_int_key(objset_t *os, uint64_t obj,
+    uint64_t key, uint64_t value, dmu_tx_t *tx);
 int zap_lookup_int_key(objset_t *os, uint64_t obj,
     uint64_t key, uint64_t *valuep);
 
-/*
- * They name is a stringified version of key; increment its value by
- * delta.  Zero values will be zap_remove()-ed.
- */
 int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
     dmu_tx_t *tx);
 
diff --git a/include/sys/zfeature.h b/include/sys/zfeature.h
new file mode 100644 (file)
index 0000000..481e85b
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_ZFEATURE_H
+#define        _SYS_ZFEATURE_H
+
+#include <sys/dmu.h>
+#include <sys/nvpair.h>
+#include "zfeature_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern boolean_t feature_is_supported(objset_t *os, uint64_t obj,
+    uint64_t desc_obj, nvlist_t *unsup_feat, nvlist_t *enabled_feat);
+
+struct spa;
+extern void spa_feature_create_zap_objects(struct spa *, dmu_tx_t *);
+extern void spa_feature_enable(struct spa *, zfeature_info_t *, dmu_tx_t *);
+extern void spa_feature_incr(struct spa *, zfeature_info_t *, dmu_tx_t *);
+extern void spa_feature_decr(struct spa *, zfeature_info_t *, dmu_tx_t *);
+extern boolean_t spa_feature_is_enabled(struct spa *, zfeature_info_t *);
+extern boolean_t spa_feature_is_active(struct spa *, zfeature_info_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFEATURE_H */
index c583887..f786f0c 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -454,6 +455,7 @@ extern void zil_replay(objset_t *os, void *arg,
     zil_replay_func_t *replay_func[TX_MAX_TYPE]);
 extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx);
 extern void    zil_destroy(zilog_t *zilog, boolean_t keep_first);
+extern void    zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx);
 extern void    zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);
 
 extern itx_t   *zil_itx_create(uint64_t txtype, size_t lrsize);
index 6c37c1a..f5b69b7 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -131,6 +132,7 @@ struct zilog {
        zil_header_t    zl_old_header;  /* debugging aid */
        uint_t          zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
        uint_t          zl_prev_rotor;  /* rotor for zl_prev[] */
+       txg_node_t      zl_dirty_link;  /* protected by dp_dirty_zilogs list */
 };
 
 typedef struct zil_bp_node {
index 289238c..0527979 100644 (file)
@@ -24,6 +24,7 @@
  */
 /*
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef _ZIO_H
@@ -278,6 +279,14 @@ typedef struct zbookmark {
 #define        ZB_ZIL_OBJECT           (0ULL)
 #define        ZB_ZIL_LEVEL            (-2LL)
 
+#define        ZB_IS_ZERO(zb)                                          \
+       ((zb)->zb_objset == 0 && (zb)->zb_object == 0 &&        \
+       (zb)->zb_level == 0 && (zb)->zb_blkid == 0)
+#define        ZB_IS_ROOT(zb)                          \
+       ((zb)->zb_object == ZB_ROOT_OBJECT &&   \
+       (zb)->zb_level == ZB_ROOT_LEVEL &&      \
+       (zb)->zb_blkid == ZB_ROOT_BLKID)
+
 typedef struct zio_prop {
        enum zio_checksum       zp_checksum;
        enum zio_compress       zp_compress;
@@ -295,6 +304,7 @@ typedef void zio_cksum_finish_f(zio_cksum_report_t *rep,
 typedef void zio_cksum_free_f(void *cbdata, size_t size);
 
 struct zio_bad_cksum;                          /* defined in zio_checksum.h */
+struct dnode_phys;
 
 struct zio_cksum_report {
        struct zio_cksum_report *zcr_next;
@@ -567,6 +577,10 @@ extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
 /* Called from spa_sync(), but primarily an injection handler */
 extern void spa_handle_ignored_writes(spa_t *spa);
 
+/* zbookmark functions */
+boolean_t zbookmark_is_before(const struct dnode_phys *dnp,
+    const zbookmark_t *zb1, const zbookmark_t *zb2);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/zfeature_common.h b/include/zfeature_common.h
new file mode 100644 (file)
index 0000000..cb1d02f
--- /dev/null
@@ -0,0 +1,72 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#ifndef _ZFEATURE_COMMON_H
+#define        _ZFEATURE_COMMON_H
+
+#include <sys/fs/zfs.h>
+#include <sys/inttypes.h>
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct zfeature_info;
+
+typedef struct zfeature_info {
+       const char *fi_uname;   /* User-facing feature name */
+       const char *fi_guid;    /* On-disk feature identifier */
+       const char *fi_desc;    /* Feature description */
+       boolean_t fi_can_readonly; /* Can open pool readonly w/o support? */
+       boolean_t fi_mos;       /* Is the feature necessary to read the MOS? */
+       struct zfeature_info **fi_depends; /* array; null terminated */
+} zfeature_info_t;
+
+typedef int (zfeature_func_t)(zfeature_info_t *fi, void *arg);
+
+#define        ZFS_FEATURE_DEBUG
+
+typedef enum spa_feature {
+       SPA_FEATURE_ASYNC_DESTROY,
+       SPA_FEATURE_EMPTY_BPOBJ,
+       SPA_FEATURES            /* entry count; sizes spa_feature_table[] */
+} spa_feature_t;
+
+extern zfeature_info_t spa_feature_table[SPA_FEATURES];
+
+extern boolean_t zfeature_is_valid_guid(const char *);
+
+extern boolean_t zfeature_is_supported(const char *);
+extern int zfeature_lookup_guid(const char *, zfeature_info_t **res);
+extern int zfeature_lookup_name(const char *, zfeature_info_t **res);
+
+extern void zpool_feature_init(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFEATURE_COMMON_H */
index 467f461..7a8f96a 100644 (file)
@@ -12,7 +12,8 @@ libnvpair_la_SOURCES = \
        $(top_srcdir)/lib/libnvpair/libnvpair.c \
        $(top_srcdir)/lib/libnvpair/nvpair_alloc_system.c \
        $(top_srcdir)/module/nvpair/nvpair_alloc_fixed.c \
-       $(top_srcdir)/module/nvpair/nvpair.c
+       $(top_srcdir)/module/nvpair/nvpair.c \
+       $(top_srcdir)/module/nvpair/fnvpair.c
 
 libnvpair_la_LIBADD = \
         $(top_builddir)/lib/libuutil/libuutil.la
index 606a919..b852cb6 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <unistd.h>
@@ -803,6 +804,10 @@ dump_nvlist(nvlist_t *list, int indent)
 
        while ((elem = nvlist_next_nvpair(list, elem)) != NULL) {
                switch (nvpair_type(elem)) {
+               case DATA_TYPE_BOOLEAN:
+                       (void) printf("%*s%s\n", indent, "", nvpair_name(elem));
+                       break;
+
                case DATA_TYPE_BOOLEAN_VALUE:
                        (void) nvpair_value_boolean_value(elem, &bool_value);
                        (void) printf("%*s%s: %s\n", indent, "",
index b36dee1..ee94fe1 100644 (file)
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+/*
  * The pool configuration repository is stored in /etc/zfs/zpool.cache as a
  * single packed nvlist.  While it would be nice to just read in this
  * file from userland, this wouldn't work from a local zone.  So we have to have
@@ -218,6 +223,36 @@ zpool_get_config(zpool_handle_t *zhp, nvlist_t **oldconfig)
 }
 
 /*
+ * Retrieves a list of enabled features and their refcounts and caches it in
+ * the pool handle.
+ */
+nvlist_t *
+zpool_get_features(zpool_handle_t *zhp)
+{
+       nvlist_t *config, *features;
+
+       config = zpool_get_config(zhp, NULL);
+
+       if (config == NULL || !nvlist_exists(config,
+           ZPOOL_CONFIG_FEATURE_STATS)) {
+               int error;
+               boolean_t missing = B_FALSE;
+
+               error = zpool_refresh_stats(zhp, &missing);
+
+               if (error != 0 || missing)
+                       return (NULL);
+
+               config = zpool_get_config(zhp, NULL);
+       }
+
+       verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
+           &features) == 0);
+
+       return (features);
+}
+
+/*
  * Refresh the vdev statistics associated with the given pool.  This is used in
  * iostat to show configuration changes and determine the delta from the last
  * time the function was called.  This function can fail, in case the pool has
index ad343e8..22e46b4 100644 (file)
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
@@ -434,8 +434,8 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
        uint_t i, nspares, nl2cache;
        boolean_t config_seen;
        uint64_t best_txg;
-       char *name, *hostname, *comment;
-       uint64_t version, guid;
+       char *name, *hostname = NULL;
+       uint64_t guid;
        uint_t children = 0;
        nvlist_t **child = NULL;
        uint_t holes;
@@ -521,61 +521,54 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
                                 * configuration:
                                 *
                                 *      version
-                                *      pool guid
-                                *      name
+                                *      pool guid
+                                *      name
+                                *      pool txg (if available)
                                 *      comment (if available)
-                                *      pool state
+                                *      pool state
                                 *      hostid (if available)
                                 *      hostname (if available)
                                 */
-                               uint64_t state;
+                               uint64_t state, version, pool_txg;
+                               char *comment = NULL;
+
+                               version = fnvlist_lookup_uint64(tmp,
+                                   ZPOOL_CONFIG_VERSION);
+                               fnvlist_add_uint64(config,
+                                   ZPOOL_CONFIG_VERSION, version);
+                               guid = fnvlist_lookup_uint64(tmp,
+                                   ZPOOL_CONFIG_POOL_GUID);
+                               fnvlist_add_uint64(config,
+                                   ZPOOL_CONFIG_POOL_GUID, guid);
+                               name = fnvlist_lookup_string(tmp,
+                                   ZPOOL_CONFIG_POOL_NAME);
+                               fnvlist_add_string(config,
+                                   ZPOOL_CONFIG_POOL_NAME, name);
 
-                               verify(nvlist_lookup_uint64(tmp,
-                                   ZPOOL_CONFIG_VERSION, &version) == 0);
-                               if (nvlist_add_uint64(config,
-                                   ZPOOL_CONFIG_VERSION, version) != 0)
-                                       goto nomem;
-                               verify(nvlist_lookup_uint64(tmp,
-                                   ZPOOL_CONFIG_POOL_GUID, &guid) == 0);
-                               if (nvlist_add_uint64(config,
-                                   ZPOOL_CONFIG_POOL_GUID, guid) != 0)
-                                       goto nomem;
-                               verify(nvlist_lookup_string(tmp,
-                                   ZPOOL_CONFIG_POOL_NAME, &name) == 0);
-                               if (nvlist_add_string(config,
-                                   ZPOOL_CONFIG_POOL_NAME, name) != 0)
-                                       goto nomem;
+                               if (nvlist_lookup_uint64(tmp,
+                                   ZPOOL_CONFIG_POOL_TXG, &pool_txg) == 0)
+                                       fnvlist_add_uint64(config,
+                                           ZPOOL_CONFIG_POOL_TXG, pool_txg);
 
-                               /*
-                                * COMMENT is optional, don't bail if it's not
-                                * there, instead, set it to NULL.
-                                */
                                if (nvlist_lookup_string(tmp,
-                                   ZPOOL_CONFIG_COMMENT, &comment) != 0)
-                                       comment = NULL;
-                               else if (nvlist_add_string(config,
-                                   ZPOOL_CONFIG_COMMENT, comment) != 0)
-                                       goto nomem;
+                                   ZPOOL_CONFIG_COMMENT, &comment) == 0)
+                                       fnvlist_add_string(config,
+                                           ZPOOL_CONFIG_COMMENT, comment);
 
-                               verify(nvlist_lookup_uint64(tmp,
-                                   ZPOOL_CONFIG_POOL_STATE, &state) == 0);
-                               if (nvlist_add_uint64(config,
-                                   ZPOOL_CONFIG_POOL_STATE, state) != 0)
-                                       goto nomem;
+                               state = fnvlist_lookup_uint64(tmp,
+                                   ZPOOL_CONFIG_POOL_STATE);
+                               fnvlist_add_uint64(config,
+                                   ZPOOL_CONFIG_POOL_STATE, state);
 
                                hostid = 0;
                                if (nvlist_lookup_uint64(tmp,
                                    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
-                                       if (nvlist_add_uint64(config,
-                                           ZPOOL_CONFIG_HOSTID, hostid) != 0)
-                                               goto nomem;
-                                       verify(nvlist_lookup_string(tmp,
-                                           ZPOOL_CONFIG_HOSTNAME,
-                                           &hostname) == 0);
-                                       if (nvlist_add_string(config,
-                                           ZPOOL_CONFIG_HOSTNAME,
-                                           hostname) != 0)
-                                               goto nomem;
+                                       fnvlist_add_uint64(config,
+                                           ZPOOL_CONFIG_HOSTID, hostid);
+                                       hostname = fnvlist_lookup_string(tmp,
+                                           ZPOOL_CONFIG_HOSTNAME);
+                                       fnvlist_add_string(config,
+                                           ZPOOL_CONFIG_HOSTNAME, hostname);
                                }
 
                                config_seen = B_TRUE;
index e2a67c6..eca1dc3 100644 (file)
@@ -45,6 +45,7 @@
 #include "zfs_prop.h"
 #include "libzfs_impl.h"
 #include "zfs_comutil.h"
+#include "zfeature_common.h"
 
 static int read_efi_label(nvlist_t *config, diskaddr_t *sb);
 
@@ -273,6 +274,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len,
                case ZPOOL_PROP_SIZE:
                case ZPOOL_PROP_ALLOCATED:
                case ZPOOL_PROP_FREE:
+               case ZPOOL_PROP_FREEING:
                case ZPOOL_PROP_EXPANDSZ:
                case ZPOOL_PROP_ASHIFT:
                        (void) zfs_nicenum(intval, buf, len);
@@ -299,6 +301,12 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len,
                        (void) strlcpy(buf, zpool_state_to_name(intval,
                            vs->vs_aux), len);
                        break;
+               case ZPOOL_PROP_VERSION:
+                       if (intval >= SPA_VERSION_FEATURES) {
+                               (void) snprintf(buf, len, "-");
+                               break;
+                       }
+                       /* FALLTHROUGH */
                default:
                        (void) snprintf(buf, len, "%llu", (u_longlong_t)intval);
                }
@@ -403,10 +411,48 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname,
        while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
                const char *propname = nvpair_name(elem);
 
+               prop = zpool_name_to_prop(propname);
+               if (prop == ZPROP_INVAL && zpool_prop_feature(propname)) {
+                       int err;
+                       zfeature_info_t *feature;
+                       char *fname = strchr(propname, '@') + 1;
+
+                       err = zfeature_lookup_name(fname, &feature);
+                       if (err != 0) {
+                               ASSERT3U(err, ==, ENOENT);
+                               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                                   "invalid feature '%s'"), fname);
+                               (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+                               goto error;
+                       }
+
+                       if (nvpair_type(elem) != DATA_TYPE_STRING) {
+                               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                                   "'%s' must be a string"), propname);
+                               (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+                               goto error;
+                       }
+
+                       (void) nvpair_value_string(elem, &strval);
+                       if (strcmp(strval, ZFS_FEATURE_ENABLED) != 0) {
+                               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                                   "property '%s' can only be set to "
+                                   "'enabled'"), propname);
+                               (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+                               goto error;
+                       }
+
+                       if (nvlist_add_uint64(retprops, propname, 0) != 0) {
+                               (void) no_memory(hdl);
+                               goto error;
+                       }
+                       continue;
+               }
+
                /*
                 * Make sure this property is valid and applies to this type.
                 */
-               if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) {
+               if (prop == ZPROP_INVAL) {
                        zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                            "invalid property '%s'"), propname);
                        (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
@@ -431,7 +477,8 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname,
                default:
                        break;
                case ZPOOL_PROP_VERSION:
-                       if (intval < version || intval > SPA_VERSION) {
+                       if (intval < version ||
+                           !SPA_VERSION_IS_SUPPORTED(intval)) {
                                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                                    "property '%s' number %d is invalid."),
                                    propname, intval);
@@ -673,10 +720,79 @@ zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp)
        libzfs_handle_t *hdl = zhp->zpool_hdl;
        zprop_list_t *entry;
        char buf[ZFS_MAXPROPLEN];
+       nvlist_t *features = NULL;
+       nvpair_t *nvp;
+       zprop_list_t **last;
+       boolean_t firstexpand = (NULL == *plp);
+       int i;
 
        if (zprop_expand_list(hdl, plp, ZFS_TYPE_POOL) != 0)
                return (-1);
 
+       last = plp;
+       while (*last != NULL)
+               last = &(*last)->pl_next;
+
+       if ((*plp)->pl_all)
+               features = zpool_get_features(zhp);
+
+       if ((*plp)->pl_all && firstexpand) {
+               for (i = 0; i < SPA_FEATURES; i++) {
+                       zprop_list_t *entry = zfs_alloc(hdl,
+                           sizeof (zprop_list_t));
+                       entry->pl_prop = ZPROP_INVAL;
+                       entry->pl_user_prop = zfs_asprintf(hdl, "feature@%s",
+                           spa_feature_table[i].fi_uname);
+                       entry->pl_width = strlen(entry->pl_user_prop);
+                       entry->pl_all = B_TRUE;
+
+                       *last = entry;
+                       last = &entry->pl_next;
+               }
+       }
+
+       /* add any unsupported features */
+       for (nvp = nvlist_next_nvpair(features, NULL);
+           nvp != NULL; nvp = nvlist_next_nvpair(features, nvp)) {
+               char *propname;
+               boolean_t found;
+               zprop_list_t *entry;
+
+               if (zfeature_is_supported(nvpair_name(nvp)))
+                       continue;
+
+               propname = zfs_asprintf(hdl, "unsupported@%s",
+                   nvpair_name(nvp));
+
+               /*
+                * Before adding the property to the list make sure that no
+                * other pool already added the same property.
+                */
+               found = B_FALSE;
+               entry = *plp;
+               while (entry != NULL) {
+                       if (entry->pl_user_prop != NULL &&
+                           strcmp(propname, entry->pl_user_prop) == 0) {
+                               found = B_TRUE;
+                               break;
+                       }
+                       entry = entry->pl_next;
+               }
+               if (found) {
+                       free(propname);
+                       continue;
+               }
+
+               entry = zfs_alloc(hdl, sizeof (zprop_list_t));
+               entry->pl_prop = ZPROP_INVAL;
+               entry->pl_user_prop = propname;
+               entry->pl_width = strlen(entry->pl_user_prop);
+               entry->pl_all = B_TRUE;
+
+               *last = entry;
+               last = &entry->pl_next;
+       }
+
        for (entry = *plp; entry != NULL; entry = entry->pl_next) {
 
                if (entry->pl_fixed)
@@ -693,6 +809,66 @@ zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp)
        return (0);
 }
 
+/*
+ * Get the state for the given feature on the given ZFS pool.
+ *
+ * 'propname' is either a feature@... or an unsupported@... property name.
+ * A state string is copied into 'buf', which holds 'len' bytes:
+ *
+ *   feature@...      ZFS_FEATURE_DISABLED if the pool's feature list has no
+ *                    entry for it, ZFS_FEATURE_ENABLED if its refcount is
+ *                    zero, ZFS_FEATURE_ACTIVE otherwise.
+ *   unsupported@...  ZFS_UNSUPPORTED_INACTIVE if its refcount is zero,
+ *                    ZFS_UNSUPPORTED_READONLY otherwise.
+ *
+ * Returns 0 on success.  Returns ENOTSUP, with "-" copied into 'buf', if
+ * the property name has no '@', the feature name is not known, or an
+ * unsupported@... feature is not in the pool's feature list.
+ */
+int
+zpool_prop_get_feature(zpool_handle_t *zhp, const char *propname, char *buf,
+    size_t len)
+{
+       uint64_t refcount;
+       boolean_t found = B_FALSE;
+       nvlist_t *features = zpool_get_features(zhp);
+       boolean_t supported;
+       const char *feature;
+       const char *at = strchr(propname, '@');
+
+       supported = zpool_prop_feature(propname);
+       ASSERT(supported || zpool_prop_unsupported(propname));
+
+       /* Without an '@' there is no feature name to look up. */
+       if (at == NULL) {
+               (void) strlcpy(buf, "-", len);
+               return (ENOTSUP);
+       }
+       feature = at + 1;
+
+       /*
+        * Convert from feature name to feature guid. This conversion is
+        * unnecessary for unsupported@... properties because they already
+        * use guids.
+        */
+       if (supported) {
+               int ret;
+               zfeature_info_t *fi;
+
+               ret = zfeature_lookup_name(feature, &fi);
+               if (ret != 0) {
+                       (void) strlcpy(buf, "-", len);
+                       return (ENOTSUP);
+               }
+               feature = fi->fi_guid;
+       }
+
+       if (nvlist_lookup_uint64(features, feature, &refcount) == 0)
+               found = B_TRUE;
+
+       if (supported) {
+               if (!found) {
+                       (void) strlcpy(buf, ZFS_FEATURE_DISABLED, len);
+               } else {
+                       if (refcount == 0)
+                               (void) strlcpy(buf, ZFS_FEATURE_ENABLED, len);
+                       else
+                               (void) strlcpy(buf, ZFS_FEATURE_ACTIVE, len);
+               }
+       } else {
+               if (found) {
+                       /* Bound the copy by 'len' like every other branch. */
+                       if (refcount == 0) {
+                               (void) strlcpy(buf, ZFS_UNSUPPORTED_INACTIVE,
+                                   len);
+                       } else {
+                               (void) strlcpy(buf, ZFS_UNSUPPORTED_READONLY,
+                                   len);
+                       }
+               } else {
+                       (void) strlcpy(buf, "-", len);
+                       return (ENOTSUP);
+               }
+       }
+
+       return (0);
+}
 
 /*
  * Don't start the slice at the default block of 34; many storage
@@ -1291,8 +1467,10 @@ zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun,
        if (!hdl->libzfs_printerr || config == NULL)
                return;
 
-       if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0)
+       if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 ||
+           nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0) {
                return;
+       }
 
        if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
                return;
@@ -1349,6 +1527,7 @@ zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason,
 
        /* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */
        if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 ||
+           nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0 ||
            nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
                goto no_info;
 
@@ -1473,6 +1652,31 @@ print_vdev_tree(libzfs_handle_t *hdl, const char *name, nvlist_t *nv,
        }
 }
 
+/*
+ * Print, one per line and tab-indented, every entry of the unsupported
+ * feature list stored under ZPOOL_CONFIG_LOAD_INFO in 'config'.  An entry
+ * with a non-empty description is printed as "name (description)".
+ */
+void
+zpool_print_unsup_feat(nvlist_t *config)
+{
+       nvlist_t *load_info, *unsup;
+       nvpair_t *pair = NULL;
+
+       verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
+           &load_info) == 0);
+       verify(nvlist_lookup_nvlist(load_info, ZPOOL_CONFIG_UNSUP_FEAT,
+           &unsup) == 0);
+
+       while ((pair = nvlist_next_nvpair(unsup, pair)) != NULL) {
+               char *desc;
+
+               verify(nvpair_type(pair) == DATA_TYPE_STRING);
+               verify(nvpair_value_string(pair, &desc) == 0);
+
+               if (desc[0] == '\0')
+                       (void) printf("\t%s\n", nvpair_name(pair));
+               else
+                       (void) printf("\t%s (%s)\n", nvpair_name(pair), desc);
+       }
+}
+
 /*
  * Import the given pool using the known configuration and a list of
  * properties to be set. The configuration should have come from
@@ -1579,6 +1783,22 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
 
                switch (error) {
                case ENOTSUP:
+                       if (nv != NULL && nvlist_lookup_nvlist(nv,
+                           ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 &&
+                           nvlist_exists(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT)) {
+                               (void) printf(dgettext(TEXT_DOMAIN, "This "
+                                   "pool uses the following feature(s) not "
+                                   "supported by this system:\n"));
+                               zpool_print_unsup_feat(nv);
+                               if (nvlist_exists(nvinfo,
+                                   ZPOOL_CONFIG_CAN_RDONLY)) {
+                                       (void) printf(dgettext(TEXT_DOMAIN,
+                                           "All unsupported features are only "
+                                           "required for writing to the pool."
+                                           "\nThe pool can be imported using "
+                                           "'-o readonly=on'.\n"));
+                               }
+                       }
                        /*
                         * Unsupported version.
                         */
index d56baf0..e6e9230 100644 (file)
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
@@ -42,6 +44,7 @@
 #include <string.h>
 #include <unistd.h>
 #include "libzfs_impl.h"
+#include "zfeature_common.h"
 
 /*
  * Message ID table.  This must be kept in sync with the ZPOOL_STATUS_* defines
@@ -215,6 +218,20 @@ check_status(nvlist_t *config, boolean_t isimport)
                return (ZPOOL_STATUS_VERSION_NEWER);
 
        /*
+        * Unsupported feature(s).
+        */
+       if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
+           vs->vs_aux == VDEV_AUX_UNSUP_FEAT) {
+               nvlist_t *nvinfo;
+
+               verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
+                   &nvinfo) == 0);
+               if (nvlist_exists(nvinfo, ZPOOL_CONFIG_CAN_RDONLY))
+                       return (ZPOOL_STATUS_UNSUP_FEAT_WRITE);
+               return (ZPOOL_STATUS_UNSUP_FEAT_READ);
+       }
+
+       /*
         * Check that the config is complete.
         */
        if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
@@ -301,9 +318,33 @@ check_status(nvlist_t *config, boolean_t isimport)
        /*
         * Outdated, but usable, version
         */
-       if (version < SPA_VERSION)
+       if (SPA_VERSION_IS_SUPPORTED(version) && version != SPA_VERSION)
                return (ZPOOL_STATUS_VERSION_OLDER);
 
+       /*
+        * Usable pool with disabled features
+        */
+       if (version >= SPA_VERSION_FEATURES) {
+               int i;
+               nvlist_t *feat;
+
+               if (isimport) {
+                       feat = fnvlist_lookup_nvlist(config,
+                           ZPOOL_CONFIG_LOAD_INFO);
+                       feat = fnvlist_lookup_nvlist(feat,
+                           ZPOOL_CONFIG_ENABLED_FEAT);
+               } else {
+                       feat = fnvlist_lookup_nvlist(config,
+                           ZPOOL_CONFIG_FEATURE_STATS);
+               }
+
+               for (i = 0; i < SPA_FEATURES; i++) {
+                       zfeature_info_t *fi = &spa_feature_table[i];
+                       if (!nvlist_exists(feat, fi->fi_guid))
+                               return (ZPOOL_STATUS_FEAT_DISABLED);
+               }
+       }
+
        return (ZPOOL_STATUS_OK);
 }
 
index 4270087..a4e1255 100644 (file)
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
@@ -47,6 +48,7 @@
 
 #include "libzfs_impl.h"
 #include "zfs_prop.h"
+#include "zfeature_common.h"
 
 int
 libzfs_errno(libzfs_handle_t *hdl)
@@ -114,7 +116,8 @@ libzfs_error_description(libzfs_handle_t *hdl)
        case EZFS_RESILVERING:
                return (dgettext(TEXT_DOMAIN, "currently resilvering"));
        case EZFS_BADVERSION:
-               return (dgettext(TEXT_DOMAIN, "unsupported version"));
+               return (dgettext(TEXT_DOMAIN, "unsupported version or "
+                   "feature"));
        case EZFS_POOLUNAVAIL:
                return (dgettext(TEXT_DOMAIN, "pool is unavailable"));
        case EZFS_DEVOVERFLOW:
@@ -709,6 +712,7 @@ libzfs_init(void)
 
        zfs_prop_init();
        zpool_prop_init();
+       zpool_feature_init();
        libzfs_mnttab_init(hdl);
 
        return (hdl);
@@ -1532,9 +1536,11 @@ addlist(libzfs_handle_t *hdl, char *propname, zprop_list_t **listp,
         * this is a pool property or if this isn't a user-defined
         * dataset property,
         */
-       if (prop == ZPROP_INVAL && (type == ZFS_TYPE_POOL ||
-           (!zfs_prop_user(propname) && !zfs_prop_userquota(propname) &&
-           !zfs_prop_written(propname)))) {
+       if (prop == ZPROP_INVAL && ((type == ZFS_TYPE_POOL &&
+           !zpool_prop_feature(propname) &&
+           !zpool_prop_unsupported(propname)) ||
+           (type == ZFS_TYPE_DATASET && !zfs_prop_user(propname) &&
+           !zfs_prop_userquota(propname) && !zfs_prop_written(propname)))) {
                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                    "invalid property '%s'"), propname);
                return (zfs_error(hdl, EZFS_BADPROP,
@@ -1546,7 +1552,8 @@ addlist(libzfs_handle_t *hdl, char *propname, zprop_list_t **listp,
 
        entry->pl_prop = prop;
        if (prop == ZPROP_INVAL) {
-               if ((entry->pl_user_prop = zfs_strdup(hdl, propname)) == NULL) {
+               if ((entry->pl_user_prop = zfs_strdup(hdl, propname)) ==
+                   NULL) {
                        free(entry);
                        return (-1);
                }
index cbe3acd..1285af3 100644 (file)
@@ -23,6 +23,7 @@ libzpool_la_SOURCES = \
        $(top_srcdir)/module/zfs/arc.c \
        $(top_srcdir)/module/zfs/bplist.c \
        $(top_srcdir)/module/zfs/bpobj.c \
+       $(top_srcdir)/module/zfs/bptree.c \
        $(top_srcdir)/module/zfs/dbuf.c \
        $(top_srcdir)/module/zfs/ddt.c \
        $(top_srcdir)/module/zfs/ddt_zap.c \
@@ -74,6 +75,8 @@ libzpool_la_SOURCES = \
        $(top_srcdir)/module/zfs/zap.c \
        $(top_srcdir)/module/zfs/zap_leaf.c \
        $(top_srcdir)/module/zfs/zap_micro.c \
+       $(top_srcdir)/module/zfs/zfeature.c \
+       $(top_srcdir)/module/zfs/zfeature_common.c \
        $(top_srcdir)/module/zfs/zfs_byteswap.c \
        $(top_srcdir)/module/zfs/zfs_debug.c \
        $(top_srcdir)/module/zfs/zfs_fm.c \
index c38efd0..0e10c89 100644 (file)
@@ -647,7 +647,9 @@ vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset,
                 * To simulate partial disk writes, we split writes into two
                 * system calls so that the process can be killed in between.
                 */
-               split = (len > 0 ? rand() % len : 0);
+               int sectors = len >> SPA_MINBLOCKSHIFT;
+               split = (sectors > 0 ? rand() % sectors : 0) <<
+                   SPA_MINBLOCKSHIFT;
                rc = pwrite64(vp->v_fd, addr, split, offset);
                if (rc != -1) {
                        done = rc;
diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5
new file mode 100644 (file)
index 0000000..0ab179e
--- /dev/null
@@ -0,0 +1,201 @@
+'\" te
+.\" Copyright (c) 2012 by Delphix. All rights reserved.
+.\" The contents of this file are subject to the terms of the Common Development
+.\" and Distribution License (the "License").  You may not use this file except
+.\" in compliance with the License. You can obtain a copy of the license at
+.\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing.
+.\"
+.\" See the License for the specific language governing permissions and
+.\" limitations under the License. When distributing Covered Code, include this
+.\" CDDL HEADER in each file and include the License file at
+.\" usr/src/OPENSOLARIS.LICENSE.  If applicable, add the following below this
+.\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your
+.\" own identifying information:
+.\" Portions Copyright [yyyy] [name of copyright owner]
+.TH ZPOOL-FEATURES 5 "Mar 16, 2012"
+.SH NAME
+zpool\-features \- ZFS pool feature descriptions
+.SH DESCRIPTION
+.sp
+.LP
+ZFS pool on\-disk format versions are specified via "features" which replace
+the old on\-disk format numbers (the last supported on\-disk format number is
+28). To enable a feature on a pool use the \fBupgrade\fR subcommand of the
+\fBzpool\fR(1M) command, or set the \fBfeature@\fR\fIfeature_name\fR property
+to \fBenabled\fR.
+.sp
+.LP
+The pool format does not affect file system version compatibility or the ability
+to send file systems between pools.
+.sp
+.LP
+Since most features can be enabled independently of each other the on\-disk
+format of the pool is specified by the set of all features marked as
+\fBactive\fR on the pool. If the pool was created by another software version
+this set may include unsupported features.
+.SS "Identifying features"
+.sp
+.LP
+Every feature has a guid of the form \fIcom.example:feature_name\fR. The reverse
+DNS name ensures that the feature's guid is unique across all ZFS
+implementations. When unsupported features are encountered on a pool they will
+be identified by their guids. Refer to the documentation for the ZFS
+implementation that created the pool for information about those features.
+.sp
+.LP
+Each supported feature also has a short name. By convention a feature's short
+name is the portion of its guid which follows the ':' (e.g.
+\fIcom.example:feature_name\fR would have the short name \fIfeature_name\fR),
+however a feature's short name may differ across ZFS implementations if
+following the convention would result in name conflicts.
+.SS "Feature states"
+.sp
+.LP
+Features can be in one of three states:
+.sp
+.ne 2
+.na
+\fB\fBactive\fR\fR
+.ad
+.RS 12n
+This feature's on\-disk format changes are in effect on the pool. Support for
+this feature is required to import the pool in read\-write mode. If this
+feature is not read\-only compatible, support is also required to import the pool
+in read\-only mode (see "Read\-only compatibility").
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBenabled\fR\fR
+.ad
+.RS 12n
+An administrator has marked this feature as enabled on the pool, but the
+feature's on\-disk format changes have not been made yet. The pool can still be
+imported by software that does not support this feature, but changes may be made
+to the on\-disk format at any time which will move the feature to the
+\fBactive\fR state. Some features may support returning to the \fBenabled\fR
+state after becoming \fBactive\fR. See feature\-specific documentation for
+details.
+.RE
+
+.sp
+.ne 2
+.na
+\fBdisabled\fR
+.ad
+.RS 12n
+This feature's on\-disk format changes have not been made and will not be made
+unless an administrator moves the feature to the \fBenabled\fR state. Features
+cannot be disabled once they have been enabled.
+.RE
+
+.sp
+.LP
+The state of supported features is exposed through pool properties of the form
+\fIfeature@short_name\fR.
+.SS "Read\-only compatibility"
+.sp
+.LP
+Some features may make on\-disk format changes that do not interfere with other
+software's ability to read from the pool. These features are referred to as
+"read\-only compatible". If all unsupported features on a pool are read\-only
+compatible, the pool can be imported in read\-only mode by setting the
+\fBreadonly\fR property during import (see \fBzpool\fR(1M) for details on
+importing pools).
+.SS "Unsupported features"
+.sp
+.LP
+For each unsupported feature enabled on an imported pool a pool property
+named \fIunsupported@feature_guid\fR will indicate why the import was allowed
+despite the unsupported feature. Possible values for this property are:
+
+.sp
+.ne 2
+.na
+\fB\fBinactive\fR\fR
+.ad
+.RS 12n
+The feature is in the \fBenabled\fR state and therefore the pool's on\-disk
+format is still compatible with software that does not support this feature.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBreadonly\fR\fR
+.ad
+.RS 12n
+The feature is read\-only compatible and the pool has been imported in
+read\-only mode.
+.RE
+
+.SS "Feature dependencies"
+.sp
+.LP
+Some features depend on other features being enabled in order to function
+properly. Enabling a feature will automatically enable any features it
+depends on.
+.SH FEATURES
+.sp
+.LP
+The following features are supported on this system:
+.sp
+.ne 2
+.na
+\fB\fBasync_destroy\fR\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID   com.delphix:async_destroy
+READ\-ONLY COMPATIBLE  yes
+DEPENDENCIES   none
+.TE
+
+Destroying a file system requires traversing all of its data in order to
+return its used space to the pool. Without \fBasync_destroy\fR the file system
+is not fully removed until all space has been reclaimed. If the destroy
+operation is interrupted by a reboot or power outage the next attempt to open
+the pool will need to complete the destroy operation synchronously.
+
+When \fBasync_destroy\fR is enabled the file system's data will be reclaimed
+by a background process, allowing the destroy operation to complete without
+traversing the entire file system. The background process is able to resume
+interrupted destroys after the pool has been opened, eliminating the need
+to finish interrupted destroys as part of the open operation. The amount
+of space remaining to be reclaimed by the background process is available
+through the \fBfreeing\fR property.
+
+This feature is only \fBactive\fR while \fBfreeing\fR is non\-zero.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBempty_bpobj\fR\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID   com.delphix:empty_bpobj
+READ\-ONLY COMPATIBLE  yes
+DEPENDENCIES   none
+.TE
+
+This feature increases the performance of creating and using a large
+number of snapshots of a single filesystem or volume, and also reduces
+the disk space required.
+
+When there are many snapshots, each snapshot uses many Block Pointer
+Objects (bpobjs) to track blocks associated with that snapshot.
+However, in common use cases, most of these bpobjs are empty.  This
+feature allows us to create each bpobj on demand, thus eliminating the
+empty bpobjs.
+
+This feature is \fBactive\fR while there are any filesystems, volumes,
+or snapshots which were created after enabling this feature.
+.RE
+
+.SH "SEE ALSO"
+\fBzpool\fR(1M)
index 35feda7..d5f61cf 100644 (file)
@@ -3,10 +3,19 @@
 .\" Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 .\" Copyright (c) 2012 by Delphix. All Rights Reserved.
 .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
-.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing.
-.\" See the License for the specific language governing permissions and limitations under the License. When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the
-.\" fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner]
-.TH zpool 8 "2 August 2012" "ZFS pool 28, filesystem 5" "System Administration Commands"
+.\" The contents of this file are subject to the terms of the Common Development
+.\" and Distribution License (the "License"). You may not use this file except
+.\" in compliance with the License. You can obtain a copy of the license at
+.\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing.
+.\"
+.\" See the License for the specific language governing permissions and
+.\" limitations under the License. When distributing Covered Code, include this
+.\" CDDL HEADER in each file and include the License file at
+.\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this
+.\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your
+.\" own identifying information:
+.\" Portions Copyright [yyyy] [name of copyright owner]
+.TH zpool 8 "14 December 2012" "ZFS pool 28, filesystem 5" "System Administration Commands"
 .SH NAME
 zpool \- configures ZFS storage pools
 .SH SYNOPSIS
@@ -32,7 +41,7 @@ zpool \- configures ZFS storage pools
 
 .LP
 .nf
-\fBzpool create\fR [\fB-fn\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-O\fR \fIfile-system-property=value\fR] 
+\fBzpool create\fR [\fB-fnd\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-O\fR \fIfile-system-property=value\fR]
      ... [\fB-m\fR \fImountpoint\fR] [\fB-R\fR \fIroot\fR] \fIpool\fR \fIvdev\fR ...
 .fi
 
@@ -467,24 +476,34 @@ Percentage of pool space used. This property can also be referred to by its shor
 .ne 2
 .mk
 .na
-\fB\fBcomment\fR\fR
+\fB\fBexpandsize\fR\fR
 .ad
 .RS 20n
-.rt
-A text string consisting of printable ASCII characters that will be stored such that it is available even if the pool becomes faulted.  An administrator can provide additional information about a pool using this property.
+Amount of uninitialized space within the pool or device that can be used to
+increase the total capacity of the pool.  Uninitialized space consists of
+any space on an EFI labeled vdev which has not been brought online
+(i.e. zpool online -e).  This space occurs when a LUN is dynamically expanded.
 .RE
 
 .sp
 .ne 2
-.mk
 .na
-\fB\fBexpandsize\fR\fR
+\fB\fBfree\fR\fR
 .ad
 .RS 20n
-Amount of uninitialized space within the pool or device that can be used to
-increase the total capacity of the pool.  Uninitialized space consists of 
-any space on an EFI labeled vdev which has not been brought online 
-(i.e. zpool online -e).  This space occurs when a LUN is dynamically expanded.
+The amount of free space available in the pool.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBfreeing\fR\fR
+.ad
+.RS 20n
+After a file system or snapshot is destroyed, the space it was using is
+returned to the pool asynchronously. \fB\fBfreeing\fR\fR is the amount of
+space remaining to be reclaimed. Over time \fB\fBfreeing\fR\fR will decrease
+while \fB\fBfree\fR\fR increases.
 .RE
 
 .sp
@@ -521,6 +540,16 @@ Total size of the storage pool.
 
 .sp
 .ne 2
+.na
+\fB\fBunsupported@\fR\fIfeature_guid\fR\fR
+.ad
+.RS 20n
+Information about unsupported features that are enabled on the pool. See
+\fBzpool-features\fR(5) for details.
+.RE
+
+.sp
+.ne 2
 .mk
 .na
 \fB\fBused\fR\fR
@@ -532,7 +561,7 @@ Amount of storage space used within the pool.
 
 .sp
 .LP
-These space usage properties report actual physical space available to the storage pool. The physical space can be different from the total amount of space that any contained datasets can actually use. The amount of space used in a \fBraidz\fR configuration depends on the characteristics of the data being written. In addition, \fBZFS\fR reserves some space for internal accounting that the \fBzfs\fR(8) command takes into account, but the \fBzpool\fR command does not. For non-full pools of a reasonable size, these effects should be invisible. For small pools, or pools that are close to being completely full, these discrepancies may become more noticeable.
+The space usage properties report actual physical space available to the storage pool. The physical space can be different from the total amount of space that any contained datasets can actually use. The amount of space used in a \fBraidz\fR configuration depends on the characteristics of the data being written. In addition, \fBZFS\fR reserves some space for internal accounting that the \fBzfs\fR(8) command takes into account, but the \fBzpool\fR command does not. For non-full pools of a reasonable size, these effects should be invisible. For small pools, or pools that are close to being completely full, these discrepancies may become more noticeable.
 
 .sp
 .LP
@@ -617,6 +646,17 @@ Multiple pools can share the same cache file. Because the kernel destroys and re
 .ne 2
 .mk
 .na
+\fB\fBcomment\fR=\fB\fItext\fR\fR
+.ad
+.sp .6
+.RS 4n
+A text string consisting of printable ASCII characters that will be stored such that it is available even if the pool becomes faulted.  An administrator can provide additional information about a pool using this property.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
 \fB\fBdelegation\fR=\fBon\fR | \fBoff\fR\fR
 .ad
 .sp .6
@@ -670,6 +710,18 @@ Prints out a message to the console and generates a system crash dump.
 
 .sp
 .ne 2
+.na
+\fB\fBfeature@\fR\fIfeature_name\fR=\fBenabled\fR\fR
+.ad
+.RS 4n
+The value of this property is the current state of \fIfeature_name\fR. The
+only valid value when setting this property is \fBenabled\fR which moves
+\fIfeature_name\fR to the enabled state. See \fBzpool-features\fR(5) for
+details on feature states.
+.RE
+
+.sp
+.ne 2
 .mk
 .na
 \fB\fBlistsnaps\fR=on | off\fR
@@ -687,7 +739,7 @@ Controls whether information about snapshots associated with this pool is output
 .ad
 .sp .6
 .RS 4n
-The current on-disk version of the pool. This can be increased, but never decreased. The preferred method of updating pools is with the "\fBzpool upgrade\fR" command, though this property can be used when a specific version is needed for backwards compatibility. This property can be any number between 1 and the current version reported by "\fBzpool upgrade -v\fR".
+The current on-disk version of the pool. This can be increased, but never decreased. The preferred method of updating pools is with the "\fBzpool upgrade\fR" command, though this property can be used when a specific version is needed for backwards compatibility. Once feature flags are enabled on a pool this property will no longer have a value.
 .RE
 
 .SS "Subcommands"
@@ -801,7 +853,7 @@ Clears device errors in a pool. If no arguments are specified, all device errors
 .ne 2
 .mk
 .na
-\fB\fBzpool create\fR [\fB-fn\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-O\fR \fIfile-system-property=value\fR] ... [\fB-m\fR \fImountpoint\fR] [\fB-R\fR \fIroot\fR] \fIpool\fR \fIvdev\fR ...\fR
+\fB\fBzpool create\fR [\fB-fnd\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-O\fR \fIfile-system-property=value\fR] ... [\fB-m\fR \fImountpoint\fR] [\fB-R\fR \fIroot\fR] \fIpool\fR \fIvdev\fR ...\fR
 .ad
 .sp .6
 .RS 4n
@@ -813,6 +865,8 @@ The command also checks that the replication strategy for the pool is consistent
 .sp
 Unless the \fB-R\fR option is specified, the default mount point is "/\fIpool\fR". The mount point must not exist or must be empty, or else the root dataset cannot be mounted. This can be overridden with the \fB-m\fR option.
 .sp
+By default all supported features are enabled on the new pool unless the \fB-d\fR option is specified.
+.sp
 .ne 2
 .mk
 .na
@@ -838,6 +892,16 @@ Displays the configuration that would be used without actually creating the pool
 .ne 2
 .mk
 .na
+\fB\fB-d\fR\fR
+.ad
+.sp .6
+.RS 4n
+Do not enable any features on the new pool. Individual features can be enabled by setting their corresponding properties to \fBenabled\fR with the \fB-o\fR option. See \fBzpool-features\fR(5) for details about feature properties.
+.RE
+
+.sp
+.ne 2
+.na
 \fB\fB-o\fR \fIproperty=value\fR [\fB-o\fR \fIproperty=value\fR] ...\fR
 .ad
 .sp .6
@@ -1564,7 +1628,7 @@ Displays verbose data error information, printing out a complete list of all dat
 .ad
 .sp .6
 .RS 4n
-Displays all pools formatted using a different \fBZFS\fR on-disk version. Older versions can continue to be used, but some features may not be available. These pools can be upgraded using "\fBzpool upgrade -a\fR". Pools that are formatted with a more recent version are also displayed, although these pools will be inaccessible on the system.
+Displays pools which do not have all supported features enabled and pools formatted using a legacy ZFS version number. These pools can continue to be used, but some features may not be available. Use "\fBzpool upgrade -a\fR" to enable all features on all pools.
 .RE
 
 .sp
@@ -1575,7 +1639,7 @@ Displays all pools formatted using a different \fBZFS\fR on-disk version. Older
 .ad
 .sp .6
 .RS 4n
-Displays \fBZFS\fR versions supported by the current software. The current \fBZFS\fR versions and all previous supported versions are displayed, along with an explanation of the features provided with each version.
+Displays legacy \fBZFS\fR versions supported by the current software. See \fBzpool-features\fR(5) for a description of the feature flags supported by the current software.
 .RE
 
 .sp
@@ -1586,7 +1650,7 @@ Displays \fBZFS\fR versions supported by the current software. The current \fBZF
 .ad
 .sp .6
 .RS 4n
-Upgrades the given pool to the latest on-disk version. Once this is done, the pool will no longer be accessible on systems running older versions of the software.
+Enables all supported features on the given pool. Once this is done, the pool will no longer be accessible on systems that do not support feature flags. See \fBzpool-features\fR(5) for details on compatibility with systems that support feature flags, but do not support all features enabled on the pool.
 .sp
 .ne 2
 .mk
@@ -1594,8 +1658,8 @@ Upgrades the given pool to the latest on-disk version. Once this is done, the po
 \fB\fB-a\fR\fR
 .ad
 .RS 14n
-.rt  
-Upgrades all pools.
+.rt
+Enables all supported features on all pools.
 .RE
 
 .sp
@@ -1605,8 +1669,8 @@ Upgrades all pools.
 \fB\fB-V\fR \fIversion\fR\fR
 .ad
 .RS 14n
-.rt  
-Upgrade to the specified version. If the \fB-V\fR flag is not specified, the pool is upgraded to the most recent version. This option can only be used to increase the version number, and only up to the most recent version supported by this software.
+.rt
+Upgrade to the specified legacy version. If the \fB-V\fR flag is specified, no features will be enabled on the pool. This option can only be used to increase the version number up to the last supported legacy version number.
 .RE
 
 .RE
@@ -1965,4 +2029,4 @@ Invalid command line options were specified.
 .SH SEE ALSO
 .sp
 .LP
-\fBzfs\fR(8)
+\fBzfs\fR(8), \fBzpool-features\fR(5)
index b53381f..211fc72 100644 (file)
@@ -5,5 +5,6 @@ EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@
 obj-$(CONFIG_ZFS) := $(MODULE).o
 
 $(MODULE)-objs += @top_srcdir@/module/nvpair/nvpair.o
+$(MODULE)-objs += @top_srcdir@/module/nvpair/fnvpair.o
 $(MODULE)-objs += @top_srcdir@/module/nvpair/nvpair_alloc_spl.o
 $(MODULE)-objs += @top_srcdir@/module/nvpair/nvpair_alloc_fixed.o
diff --git a/module/nvpair/fnvpair.c b/module/nvpair/fnvpair.c
new file mode 100644 (file)
index 0000000..fa28afc
--- /dev/null
@@ -0,0 +1,566 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#include <sys/nvpair.h>
+#include <sys/kmem.h>
+#include <sys/debug.h>
+#ifndef _KERNEL
+#include <stdlib.h>
+#endif
+
+/*
+ * "Force" nvlist wrapper.
+ *
+ * These functions wrap the nvlist_* functions with assertions that assume
+ * the operation is successful.  This allows the caller's code to be much
+ * more readable, especially for the fnvlist_lookup_* and fnvpair_value_*
+ * functions, which can return the requested value (rather than filling in
+ * a pointer).
+ *
+ * These functions use NV_UNIQUE_NAME, encoding NV_ENCODE_NATIVE, and allocate
+ * with KM_SLEEP.
+ *
+ * More wrappers should be added as needed -- for example
+ * nvlist_lookup_*_array and nvpair_value_*_array.
+ */
+
+/*
+ * Allocate a new nvlist with the NV_UNIQUE_NAME flag using a KM_SLEEP
+ * allocation.  The nvlist_alloc() call is VERIFY'd to return 0, so no
+ * error is ever reported to the caller.
+ */
+nvlist_t *
+fnvlist_alloc(void)
+{
+       nvlist_t *nvl;
+       VERIFY3U(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP), ==, 0);
+       return (nvl);
+}
+
+/*
+ * Free an nvlist.  This is a direct pass-through to nvlist_free(); there
+ * is no return value to verify.
+ */
+void
+fnvlist_free(nvlist_t *nvl)
+{
+       nvlist_free(nvl);
+}
+
+/*
+ * Return the size reported by nvlist_size() for nvl with the
+ * NV_ENCODE_NATIVE encoding.  The call is VERIFY'd to return 0.
+ */
+size_t
+fnvlist_size(nvlist_t *nvl)
+{
+       size_t size;
+       VERIFY3U(nvlist_size(nvl, &size, NV_ENCODE_NATIVE), ==, 0);
+       return (size);
+}
+
+/*
+ * Pack nvl with the NV_ENCODE_NATIVE encoding using a KM_SLEEP allocation;
+ * the nvlist_pack() call is VERIFY'd to return 0.
+ *
+ * Returns allocated buffer of size *sizep.  Caller must free the buffer with
+ * fnvlist_pack_free().
+ */
+char *
+fnvlist_pack(nvlist_t *nvl, size_t *sizep)
+{
+       /* Start from a NULL buffer so that nvlist_pack() allocates one. */
+       char *packed = NULL;
+       VERIFY3U(nvlist_pack(nvl, &packed, sizep, NV_ENCODE_NATIVE,
+           KM_SLEEP), ==, 0);
+       return (packed);
+}
+
+/*
+ * Free a buffer that was returned by fnvlist_pack().
+ *
+ * In the kernel the buffer is released with kmem_free(), which is passed
+ * size.  In userland it is released with free() and size is not used,
+ * which is why the function is marked ARGSUSED.
+ */
+/*ARGSUSED*/
+void
+fnvlist_pack_free(char *pack, size_t size)
+{
+#ifdef _KERNEL
+       kmem_free(pack, size);
+#else
+       free(pack);
+#endif
+}
+
+/*
+ * Unpack the buflen bytes at buf with nvlist_unpack() (KM_SLEEP) and return
+ * the resulting nvlist.  The call is VERIFY'd to return 0.
+ */
+nvlist_t *
+fnvlist_unpack(char *buf, size_t buflen)
+{
+       nvlist_t *rv;
+       VERIFY3U(nvlist_unpack(buf, buflen, &rv, KM_SLEEP), ==, 0);
+       return (rv);
+}
+
+/*
+ * Return the nvlist produced by nvlist_dup() for nvl, using a KM_SLEEP
+ * allocation.  The call is VERIFY'd to return 0.
+ */
+nvlist_t *
+fnvlist_dup(nvlist_t *nvl)
+{
+       nvlist_t *rv;
+       VERIFY3U(nvlist_dup(nvl, &rv, KM_SLEEP), ==, 0);
+       return (rv);
+}
+
+/*
+ * Merge src into dst with nvlist_merge(), using a KM_SLEEP allocation.
+ * The call is VERIFY'd to return 0.
+ */
+void
+fnvlist_merge(nvlist_t *dst, nvlist_t *src)
+{
+       VERIFY3U(nvlist_merge(dst, src, KM_SLEEP), ==, 0);
+}
+
+/*
+ * fnvlist_add_*() value wrappers.
+ *
+ * Each function forwards its arguments unchanged to the nvlist_add_*()
+ * function of the same name and VERIFYs that the call returned 0.  None
+ * of them return a value.
+ */
+void
+fnvlist_add_boolean(nvlist_t *nvl, const char *name)
+{
+       VERIFY3U(nvlist_add_boolean(nvl, name), ==, 0);
+}
+
+void
+fnvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val)
+{
+       VERIFY3U(nvlist_add_boolean_value(nvl, name, val), ==, 0);
+}
+
+void
+fnvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val)
+{
+       VERIFY3U(nvlist_add_byte(nvl, name, val), ==, 0);
+}
+
+void
+fnvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val)
+{
+       VERIFY3U(nvlist_add_int8(nvl, name, val), ==, 0);
+}
+
+void
+fnvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val)
+{
+       VERIFY3U(nvlist_add_uint8(nvl, name, val), ==, 0);
+}
+
+void
+fnvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val)
+{
+       VERIFY3U(nvlist_add_int16(nvl, name, val), ==, 0);
+}
+
+void
+fnvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val)
+{
+       VERIFY3U(nvlist_add_uint16(nvl, name, val), ==, 0);
+}
+
+void
+fnvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val)
+{
+       VERIFY3U(nvlist_add_int32(nvl, name, val), ==, 0);
+}
+
+void
+fnvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val)
+{
+       VERIFY3U(nvlist_add_uint32(nvl, name, val), ==, 0);
+}
+
+void
+fnvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val)
+{
+       VERIFY3U(nvlist_add_int64(nvl, name, val), ==, 0);
+}
+
+void
+fnvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val)
+{
+       VERIFY3U(nvlist_add_uint64(nvl, name, val), ==, 0);
+}
+
+void
+fnvlist_add_string(nvlist_t *nvl, const char *name, const char *val)
+{
+       VERIFY3U(nvlist_add_string(nvl, name, val), ==, 0);
+}
+
+void
+fnvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val)
+{
+       VERIFY3U(nvlist_add_nvlist(nvl, name, val), ==, 0);
+}
+
+void
+fnvlist_add_nvpair(nvlist_t *nvl, nvpair_t *pair)
+{
+       VERIFY3U(nvlist_add_nvpair(nvl, pair), ==, 0);
+}
+
+/*
+ * fnvlist_add_*_array() wrappers.
+ *
+ * Each function passes the array pointer val and the count n through
+ * unchanged to the nvlist_add_*_array() function of the same name and
+ * VERIFYs that the call returned 0.  None of them return a value.
+ */
+void
+fnvlist_add_boolean_array(nvlist_t *nvl, const char *name,
+    boolean_t *val, uint_t n)
+{
+       VERIFY3U(nvlist_add_boolean_array(nvl, name, val, n), ==, 0);
+}
+
+void
+fnvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *val, uint_t n)
+{
+       VERIFY3U(nvlist_add_byte_array(nvl, name, val, n), ==, 0);
+}
+
+void
+fnvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *val, uint_t n)
+{
+       VERIFY3U(nvlist_add_int8_array(nvl, name, val, n), ==, 0);
+}
+
+void
+fnvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *val, uint_t n)
+{
+       VERIFY3U(nvlist_add_uint8_array(nvl, name, val, n), ==, 0);
+}
+
+void
+fnvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *val, uint_t n)
+{
+       VERIFY3U(nvlist_add_int16_array(nvl, name, val, n), ==, 0);
+}
+
+void
+fnvlist_add_uint16_array(nvlist_t *nvl, const char *name,
+    uint16_t *val, uint_t n)
+{
+       VERIFY3U(nvlist_add_uint16_array(nvl, name, val, n), ==, 0);
+}
+
+void
+fnvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *val, uint_t n)
+{
+       VERIFY3U(nvlist_add_int32_array(nvl, name, val, n), ==, 0);
+}
+
+void
+fnvlist_add_uint32_array(nvlist_t *nvl, const char *name,
+    uint32_t *val, uint_t n)
+{
+       VERIFY3U(nvlist_add_uint32_array(nvl, name, val, n), ==, 0);
+}
+
+void
+fnvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *val, uint_t n)
+{
+       VERIFY3U(nvlist_add_int64_array(nvl, name, val, n), ==, 0);
+}
+
+void
+fnvlist_add_uint64_array(nvlist_t *nvl, const char *name,
+    uint64_t *val, uint_t n)
+{
+       VERIFY3U(nvlist_add_uint64_array(nvl, name, val, n), ==, 0);
+}
+
+void
+fnvlist_add_string_array(nvlist_t *nvl, const char *name,
+    char * const *val, uint_t n)
+{
+       VERIFY3U(nvlist_add_string_array(nvl, name, val, n), ==, 0);
+}
+
+void
+fnvlist_add_nvlist_array(nvlist_t *nvl, const char *name,
+    nvlist_t **val, uint_t n)
+{
+       VERIFY3U(nvlist_add_nvlist_array(nvl, name, val, n), ==, 0);
+}
+
+/*
+ * Removal wrappers.
+ *
+ * Note that fnvlist_remove() is built on nvlist_remove_all(), not
+ * nvlist_remove().  Both wrappers VERIFY that the underlying call
+ * returned 0.
+ */
+void
+fnvlist_remove(nvlist_t *nvl, const char *name)
+{
+       VERIFY3U(nvlist_remove_all(nvl, name), ==, 0);
+}
+
+void
+fnvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *pair)
+{
+       VERIFY3U(nvlist_remove_nvpair(nvl, pair), ==, 0);
+}
+
+/*
+ * fnvlist_lookup_*() wrappers.
+ *
+ * With the exception of fnvlist_lookup_boolean(), each function calls the
+ * matching nvlist_lookup_*() function, VERIFYs that it returned 0 and
+ * returns the value that was filled in.  A failed lookup therefore trips
+ * the VERIFY rather than being reported to the caller.
+ */
+nvpair_t *
+fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name)
+{
+       nvpair_t *rv;
+       VERIFY3U(nvlist_lookup_nvpair(nvl, name, &rv), ==, 0);
+       return (rv);
+}
+
+/*
+ * returns B_TRUE if the entry exists
+ *
+ * Unlike the other lookup wrappers this one does not VERIFY anything: the
+ * return value of nvlist_lookup_boolean() is what determines the result.
+ */
+boolean_t
+fnvlist_lookup_boolean(nvlist_t *nvl, const char *name)
+{
+       return (nvlist_lookup_boolean(nvl, name) == 0);
+}
+
+boolean_t
+fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name)
+{
+       boolean_t rv;
+       VERIFY3U(nvlist_lookup_boolean_value(nvl, name, &rv), ==, 0);
+       return (rv);
+}
+
+uchar_t
+fnvlist_lookup_byte(nvlist_t *nvl, const char *name)
+{
+       uchar_t rv;
+       VERIFY3U(nvlist_lookup_byte(nvl, name, &rv), ==, 0);
+       return (rv);
+}
+
+int8_t
+fnvlist_lookup_int8(nvlist_t *nvl, const char *name)
+{
+       int8_t rv;
+       VERIFY3U(nvlist_lookup_int8(nvl, name, &rv), ==, 0);
+       return (rv);
+}
+
+int16_t
+fnvlist_lookup_int16(nvlist_t *nvl, const char *name)
+{
+       int16_t rv;
+       VERIFY3U(nvlist_lookup_int16(nvl, name, &rv), ==, 0);
+       return (rv);
+}
+
+int32_t
+fnvlist_lookup_int32(nvlist_t *nvl, const char *name)
+{
+       int32_t rv;
+       VERIFY3U(nvlist_lookup_int32(nvl, name, &rv), ==, 0);
+       return (rv);
+}
+
+int64_t
+fnvlist_lookup_int64(nvlist_t *nvl, const char *name)
+{
+       int64_t rv;
+       VERIFY3U(nvlist_lookup_int64(nvl, name, &rv), ==, 0);
+       return (rv);
+}
+
+uint8_t
+fnvlist_lookup_uint8(nvlist_t *nvl, const char *name)
+{
+       uint8_t rv;
+       VERIFY3U(nvlist_lookup_uint8(nvl, name, &rv), ==, 0);
+       return (rv);
+}
+
+uint16_t
+fnvlist_lookup_uint16(nvlist_t *nvl, const char *name)
+{
+       uint16_t rv;
+       VERIFY3U(nvlist_lookup_uint16(nvl, name, &rv), ==, 0);
+       return (rv);
+}
+
+uint32_t
+fnvlist_lookup_uint32(nvlist_t *nvl, const char *name)
+{
+       uint32_t rv;
+       VERIFY3U(nvlist_lookup_uint32(nvl, name, &rv), ==, 0);
+       return (rv);
+}
+
+uint64_t
+fnvlist_lookup_uint64(nvlist_t *nvl, const char *name)
+{
+       uint64_t rv;
+       VERIFY3U(nvlist_lookup_uint64(nvl, name, &rv), ==, 0);
+       return (rv);
+}
+
+char *
+fnvlist_lookup_string(nvlist_t *nvl, const char *name)
+{
+       char *rv;
+       VERIFY3U(nvlist_lookup_string(nvl, name, &rv), ==, 0);
+       return (rv);
+}
+
+nvlist_t *
+fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name)
+{
+       nvlist_t *rv;
+       VERIFY3U(nvlist_lookup_nvlist(nvl, name, &rv), ==, 0);
+       return (rv);
+}
+
+/*
+ * fnvpair_value_*() wrappers.
+ *
+ * Each function calls the matching nvpair_value_*() function on nvp,
+ * VERIFYs that it returned 0 and returns the value that was filled in.
+ */
+boolean_t
+fnvpair_value_boolean_value(nvpair_t *nvp)
+{
+       boolean_t rv;
+       VERIFY3U(nvpair_value_boolean_value(nvp, &rv), ==, 0);
+       return (rv);
+}
+
+uchar_t
+fnvpair_value_byte(nvpair_t *nvp)
+{
+       uchar_t rv;
+       VERIFY3U(nvpair_value_byte(nvp, &rv), ==, 0);
+       return (rv);
+}
+
+int8_t
+fnvpair_value_int8(nvpair_t *nvp)
+{
+       int8_t rv;
+       VERIFY3U(nvpair_value_int8(nvp, &rv), ==, 0);
+       return (rv);
+}
+
+int16_t
+fnvpair_value_int16(nvpair_t *nvp)
+{
+       int16_t rv;
+       VERIFY3U(nvpair_value_int16(nvp, &rv), ==, 0);
+       return (rv);
+}
+
+int32_t
+fnvpair_value_int32(nvpair_t *nvp)
+{
+       int32_t rv;
+       VERIFY3U(nvpair_value_int32(nvp, &rv), ==, 0);
+       return (rv);
+}
+
+int64_t
+fnvpair_value_int64(nvpair_t *nvp)
+{
+       int64_t rv;
+       VERIFY3U(nvpair_value_int64(nvp, &rv), ==, 0);
+       return (rv);
+}
+
+uint8_t
+fnvpair_value_uint8(nvpair_t *nvp)
+{
+       uint8_t rv;
+       VERIFY3U(nvpair_value_uint8(nvp, &rv), ==, 0);
+       return (rv);
+}
+
+uint16_t
+fnvpair_value_uint16(nvpair_t *nvp)
+{
+       uint16_t rv;
+       VERIFY3U(nvpair_value_uint16(nvp, &rv), ==, 0);
+       return (rv);
+}
+
+uint32_t
+fnvpair_value_uint32(nvpair_t *nvp)
+{
+       uint32_t rv;
+       VERIFY3U(nvpair_value_uint32(nvp, &rv), ==, 0);
+       return (rv);
+}
+
+uint64_t
+fnvpair_value_uint64(nvpair_t *nvp)
+{
+       uint64_t rv;
+       VERIFY3U(nvpair_value_uint64(nvp, &rv), ==, 0);
+       return (rv);
+}
+
+char *
+fnvpair_value_string(nvpair_t *nvp)
+{
+       char *rv;
+       VERIFY3U(nvpair_value_string(nvp, &rv), ==, 0);
+       return (rv);
+}
+
+nvlist_t *
+fnvpair_value_nvlist(nvpair_t *nvp)
+{
+       nvlist_t *rv;
+       VERIFY3U(nvpair_value_nvlist(nvp, &rv), ==, 0);
+       return (rv);
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+
+/*
+ * Export every public fnvlist and fnvpair wrapper defined in this file.
+ * fnvlist_pack_free() is included because fnvlist_pack() requires its
+ * callers to release the packed buffer with it.
+ */
+EXPORT_SYMBOL(fnvlist_alloc);
+EXPORT_SYMBOL(fnvlist_free);
+EXPORT_SYMBOL(fnvlist_size);
+EXPORT_SYMBOL(fnvlist_pack);
+EXPORT_SYMBOL(fnvlist_pack_free);
+EXPORT_SYMBOL(fnvlist_unpack);
+EXPORT_SYMBOL(fnvlist_dup);
+EXPORT_SYMBOL(fnvlist_merge);
+
+EXPORT_SYMBOL(fnvlist_add_nvpair);
+EXPORT_SYMBOL(fnvlist_add_boolean);
+EXPORT_SYMBOL(fnvlist_add_boolean_value);
+EXPORT_SYMBOL(fnvlist_add_byte);
+EXPORT_SYMBOL(fnvlist_add_int8);
+EXPORT_SYMBOL(fnvlist_add_uint8);
+EXPORT_SYMBOL(fnvlist_add_int16);
+EXPORT_SYMBOL(fnvlist_add_uint16);
+EXPORT_SYMBOL(fnvlist_add_int32);
+EXPORT_SYMBOL(fnvlist_add_uint32);
+EXPORT_SYMBOL(fnvlist_add_int64);
+EXPORT_SYMBOL(fnvlist_add_uint64);
+EXPORT_SYMBOL(fnvlist_add_string);
+EXPORT_SYMBOL(fnvlist_add_nvlist);
+EXPORT_SYMBOL(fnvlist_add_boolean_array);
+EXPORT_SYMBOL(fnvlist_add_byte_array);
+EXPORT_SYMBOL(fnvlist_add_int8_array);
+EXPORT_SYMBOL(fnvlist_add_uint8_array);
+EXPORT_SYMBOL(fnvlist_add_int16_array);
+EXPORT_SYMBOL(fnvlist_add_uint16_array);
+EXPORT_SYMBOL(fnvlist_add_int32_array);
+EXPORT_SYMBOL(fnvlist_add_uint32_array);
+EXPORT_SYMBOL(fnvlist_add_int64_array);
+EXPORT_SYMBOL(fnvlist_add_uint64_array);
+EXPORT_SYMBOL(fnvlist_add_string_array);
+EXPORT_SYMBOL(fnvlist_add_nvlist_array);
+
+EXPORT_SYMBOL(fnvlist_remove);
+EXPORT_SYMBOL(fnvlist_remove_nvpair);
+
+EXPORT_SYMBOL(fnvlist_lookup_nvpair);
+EXPORT_SYMBOL(fnvlist_lookup_boolean);
+EXPORT_SYMBOL(fnvlist_lookup_boolean_value);
+EXPORT_SYMBOL(fnvlist_lookup_byte);
+EXPORT_SYMBOL(fnvlist_lookup_int8);
+EXPORT_SYMBOL(fnvlist_lookup_uint8);
+EXPORT_SYMBOL(fnvlist_lookup_int16);
+EXPORT_SYMBOL(fnvlist_lookup_uint16);
+EXPORT_SYMBOL(fnvlist_lookup_int32);
+EXPORT_SYMBOL(fnvlist_lookup_uint32);
+EXPORT_SYMBOL(fnvlist_lookup_int64);
+EXPORT_SYMBOL(fnvlist_lookup_uint64);
+EXPORT_SYMBOL(fnvlist_lookup_string);
+EXPORT_SYMBOL(fnvlist_lookup_nvlist);
+
+EXPORT_SYMBOL(fnvpair_value_boolean_value);
+EXPORT_SYMBOL(fnvpair_value_byte);
+EXPORT_SYMBOL(fnvpair_value_int8);
+EXPORT_SYMBOL(fnvpair_value_uint8);
+EXPORT_SYMBOL(fnvpair_value_int16);
+EXPORT_SYMBOL(fnvpair_value_uint16);
+EXPORT_SYMBOL(fnvpair_value_int32);
+EXPORT_SYMBOL(fnvpair_value_uint32);
+EXPORT_SYMBOL(fnvpair_value_int64);
+EXPORT_SYMBOL(fnvpair_value_uint64);
+EXPORT_SYMBOL(fnvpair_value_string);
+EXPORT_SYMBOL(fnvpair_value_nvlist);
+
+#endif
index 303edce..1173fc0 100644 (file)
@@ -79,6 +79,8 @@ zpool_prop_init(void)
            ZFS_TYPE_POOL, "<size>", "SIZE");
        zprop_register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY,
            ZFS_TYPE_POOL, "<size>", "FREE");
+       zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY,
+           ZFS_TYPE_POOL, "<size>", "FREEING");
        zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0,
            PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC");
        zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0,
@@ -170,6 +172,26 @@ zpool_prop_default_numeric(zpool_prop_t prop)
        return (zpool_prop_table[prop].pd_numdefault);
 }
 
+/*
+ * Returns true if name starts with the "feature@" prefix.  Only the prefix
+ * is compared; whatever follows it in name is not examined here.
+ */
+boolean_t
+zpool_prop_feature(const char *name)
+{
+       static const char prefix[] = "feature@";
+
+       /* sizeof includes the terminating NUL, so compare one byte less. */
+       return (strncmp(name, prefix, sizeof (prefix) - 1) == 0);
+}
+
+/*
+ * Returns true if name starts with the "unsupported@" prefix.  Only the
+ * prefix is compared; whatever follows it in name is not examined here.
+ */
+boolean_t
+zpool_prop_unsupported(const char *name)
+{
+       static const char prefix[] = "unsupported@";
+
+       /* sizeof includes the terminating NUL, so compare one byte less. */
+       return (strncmp(name, prefix, sizeof (prefix) - 1) == 0);
+}
+
 int
 zpool_prop_string_to_index(zpool_prop_t prop, const char *string,
     uint64_t *index)
@@ -223,6 +245,8 @@ EXPORT_SYMBOL(zpool_prop_to_name);
 EXPORT_SYMBOL(zpool_prop_default_string);
 EXPORT_SYMBOL(zpool_prop_default_numeric);
 EXPORT_SYMBOL(zpool_prop_readonly);
+EXPORT_SYMBOL(zpool_prop_feature);
+EXPORT_SYMBOL(zpool_prop_unsupported);
 EXPORT_SYMBOL(zpool_prop_index_to_string);
 EXPORT_SYMBOL(zpool_prop_string_to_index);
 #endif
index 98576d1..f1e32a1 100644 (file)
@@ -8,6 +8,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/arc.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/bplist.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/bpobj.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dbuf.o
+$(MODULE)-objs += @top_srcdir@/module/zfs/bptree.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/ddt.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/ddt_zap.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dmu.o
@@ -59,6 +60,8 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/vdev_root.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zap.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zap_leaf.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zap_micro.o
+$(MODULE)-objs += @top_srcdir@/module/zfs/zfeature.o
+$(MODULE)-objs += @top_srcdir@/module/zfs/zfeature_common.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_acl.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_byteswap.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_ctldir.o
index 6ec9f04..5e21b1b 100644 (file)
@@ -189,6 +189,7 @@ unsigned long zfs_arc_meta_limit = 0;
 int zfs_arc_grow_retry = 0;
 int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
+int zfs_disable_dup_eviction = 0;
 int zfs_arc_meta_prune = 0;
 
 /*
@@ -307,6 +308,9 @@ typedef struct arc_stats {
        kstat_named_t arcstat_l2_size;
        kstat_named_t arcstat_l2_hdr_size;
        kstat_named_t arcstat_memory_throttle_count;
+       kstat_named_t arcstat_duplicate_buffers;
+       kstat_named_t arcstat_duplicate_buffers_size;
+       kstat_named_t arcstat_duplicate_reads;
        kstat_named_t arcstat_memory_direct_count;
        kstat_named_t arcstat_memory_indirect_count;
        kstat_named_t arcstat_no_grow;
@@ -387,6 +391,9 @@ static arc_stats_t arc_stats = {
        { "l2_size",                    KSTAT_DATA_UINT64 },
        { "l2_hdr_size",                KSTAT_DATA_UINT64 },
        { "memory_throttle_count",      KSTAT_DATA_UINT64 },
+       { "duplicate_buffers",          KSTAT_DATA_UINT64 },
+       { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
+       { "duplicate_reads",            KSTAT_DATA_UINT64 },
        { "memory_direct_count",        KSTAT_DATA_UINT64 },
        { "memory_indirect_count",      KSTAT_DATA_UINT64 },
        { "arc_no_grow",                KSTAT_DATA_UINT64 },
@@ -1369,6 +1376,17 @@ arc_buf_clone(arc_buf_t *from)
        hdr->b_buf = buf;
        arc_get_data_buf(buf);
        bcopy(from->b_data, buf->b_data, size);
+
+       /*
+        * This buffer already exists in the arc so create a duplicate
+        * copy for the caller.  If the buffer is associated with user data
+        * then track the size and number of duplicates.  These stats will be
+        * updated as duplicate buffers are created and destroyed.
+        */
+       if (hdr->b_type == ARC_BUFC_DATA) {
+               ARCSTAT_BUMP(arcstat_duplicate_buffers);
+               ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
+       }
        hdr->b_datacnt += 1;
        return (buf);
 }
@@ -1467,6 +1485,16 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
                ASSERT3U(state->arcs_size, >=, size);
                atomic_add_64(&state->arcs_size, -size);
                buf->b_data = NULL;
+
+               /*
+                * If we're destroying a duplicate buffer make sure
+                * that the appropriate statistics are updated.
+                */
+               if (buf->b_hdr->b_datacnt > 1 &&
+                   buf->b_hdr->b_type == ARC_BUFC_DATA) {
+                       ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
+                       ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
+               }
                ASSERT(buf->b_hdr->b_datacnt > 0);
                buf->b_hdr->b_datacnt -= 1;
        }
@@ -1652,6 +1680,48 @@ arc_buf_size(arc_buf_t *buf)
 }
 
 /*
+ * Called from the DMU to determine if the current buffer should be
+ * evicted. In order to ensure proper locking, the eviction must be initiated
+ * from the DMU. Return true if the buffer is associated with user data and
+ * duplicate buffers still exist.
+ */
+boolean_t
+arc_buf_eviction_needed(arc_buf_t *buf)
+{
+       arc_buf_hdr_t *hdr;
+       boolean_t evict_needed;
+
+       /* Duplicate eviction can be switched off entirely. */
+       if (zfs_disable_dup_eviction)
+               return (B_FALSE);
+
+       mutex_enter(&buf->b_evict_lock);
+       hdr = buf->b_hdr;
+       if (hdr == NULL) {
+               /*
+                * The header is already gone: we are inside
+                * arc_do_user_evicts(), which performs the eviction
+                * itself.
+                */
+               ASSERT(buf->b_data == NULL);
+               evict_needed = B_FALSE;
+       } else if (buf->b_data == NULL) {
+               /*
+                * The buffer already sits on the arc eviction list, so
+                * tell the caller to go ahead and evict.
+                */
+               ASSERT3P(hdr, ==, &arc_eviction_hdr);
+               evict_needed = B_TRUE;
+       } else if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) {
+               /* User data whose header carries more than one buffer. */
+               evict_needed = B_TRUE;
+       } else {
+               evict_needed = B_FALSE;
+       }
+       mutex_exit(&buf->b_evict_lock);
+
+       return (evict_needed);
+}
+
+/*
  * Evict buffers from list until we've removed the specified number of
  * bytes.  Move the removed buffers to the appropriate evict state.
  * If the recycle flag is set, then attempt to "recycle" a buffer:
@@ -2729,9 +2799,11 @@ arc_read_done(zio_t *zio)
        callback_list = hdr->b_acb;
        ASSERT(callback_list != NULL);
        if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
+               dmu_object_byteswap_t bswap =
+                   DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
                arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
                    byteswap_uint64_array :
-                   dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
+                   dmu_ot_byteswap[bswap].ob_func;
                func(buf->b_data, hdr->b_size);
        }
 
@@ -2751,8 +2823,10 @@ arc_read_done(zio_t *zio)
        abuf = buf;
        for (acb = callback_list; acb; acb = acb->acb_next) {
                if (acb->acb_done) {
-                       if (abuf == NULL)
+                       if (abuf == NULL) {
+                               ARCSTAT_BUMP(arcstat_duplicate_reads);
                                abuf = arc_buf_clone(buf);
+                       }
                        acb->acb_buf = abuf;
                        abuf = NULL;
                }
@@ -3322,6 +3396,16 @@ arc_release(arc_buf_t *buf, void *tag)
                        ASSERT3U(*size, >=, hdr->b_size);
                        atomic_add_64(size, -hdr->b_size);
                }
+
+               /*
+                * We're releasing a duplicate user data buffer, update
+                * our statistics accordingly.
+                */
+               if (hdr->b_type == ARC_BUFC_DATA) {
+                       ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
+                       ARCSTAT_INCR(arcstat_duplicate_buffers_size,
+                           -hdr->b_size);
+               }
                hdr->b_datacnt -= 1;
                arc_cksum_verify(buf);
 
index 022921c..d5f8d40 100644 (file)
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/bpobj.h>
 #include <sys/zfs_context.h>
 #include <sys/refcount.h>
 #include <sys/dsl_pool.h>
+#include <sys/zfeature.h>
+#include <sys/zap.h>
+
+/*
+ * Hand back the object number of an empty bpobj.
+ *
+ * When the empty_bpobj feature is enabled on the pool, the shared
+ * placeholder object (dp_empty_bpobj) is returned and the feature's count
+ * is incremented; the placeholder is created and entered into the pool
+ * directory first if the feature is not yet active.  When the feature is
+ * not enabled, a fresh bpobj is allocated with the caller's blocksize.
+ */
+uint64_t
+bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
+{
+       spa_t *spa = dmu_objset_spa(os);
+       dsl_pool_t *dp = dmu_objset_pool(os);
+       zfeature_info_t *feat = &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ];
+
+       /* Without the feature every caller gets its own object. */
+       if (!spa_feature_is_enabled(spa, feat))
+               return (bpobj_alloc(os, blocksize, tx));
+
+       if (!spa_feature_is_active(spa, feat)) {
+               /*
+                * Feature not active yet: create the shared object and
+                * record it in the pool directory.
+                */
+               ASSERT3U(dp->dp_empty_bpobj, ==, 0);
+               dp->dp_empty_bpobj = bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx);
+               VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT,
+                   DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
+                   &dp->dp_empty_bpobj, tx) == 0);
+       }
+
+       spa_feature_incr(spa, feat, tx);
+       ASSERT(dp->dp_empty_bpobj != 0);
+       return (dp->dp_empty_bpobj);
+}
+
+/*
+ * Drop one use of the shared empty bpobj.  Once the empty_bpobj feature is
+ * no longer active, the object is removed from the pool directory and
+ * freed, and dp_empty_bpobj is reset to 0.
+ */
+void
+bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
+{
+       spa_t *spa = dmu_objset_spa(os);
+       dsl_pool_t *dp = dmu_objset_pool(os);
+       zfeature_info_t *feat = &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ];
+
+       spa_feature_decr(spa, feat, tx);
+       if (spa_feature_is_active(spa, feat))
+               return;
+
+       /* Feature no longer active: unlink and free the shared object. */
+       VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
+           DMU_POOL_DIRECTORY_OBJECT,
+           DMU_POOL_EMPTY_BPOBJ, tx));
+       VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
+       dp->dp_empty_bpobj = 0;
+}
 
 uint64_t
 bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
@@ -53,6 +101,7 @@ bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
        int epb;
        dmu_buf_t *dbuf = NULL;
 
+       ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
        VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
 
        mutex_enter(&bpo.bpo_lock);
@@ -320,6 +369,12 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
 
        ASSERT(bpo->bpo_havesubobj);
        ASSERT(bpo->bpo_havecomp);
+       ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
+
+       if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
+               bpobj_decr_empty(bpo->bpo_os, tx);
+               return;
+       }
 
        VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
        VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
@@ -388,6 +443,7 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
        blkptr_t *bparray;
 
        ASSERT(!BP_IS_HOLE(bp));
+       ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
 
        /* We never need the fill count. */
        stored_bp.blk_fill = 0;
diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c
new file mode 100644 (file)
index 0000000..8c5a7d4
--- /dev/null
@@ -0,0 +1,224 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#include <sys/arc.h>
+#include <sys/bptree.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dnode.h>
+#include <sys/refcount.h>
+#include <sys/spa.h>
+
+/*
+ * A bptree is a queue of root block pointers from destroyed datasets. When a
+ * dataset is destroyed its root block pointer is put on the end of the pool's
+ * bptree queue so the dataset's blocks can be freed asynchronously by
+ * dsl_scan_sync. This allows the delete operation to finish without traversing
+ * all the dataset's blocks.
+ *
+ * Note that while bt_begin and bt_end are only ever incremented in this code
+ * they are effectively reset to 0 every time the entire bptree is freed because
+ * the bptree's object is destroyed and re-created.
+ */
+
+/*
+ * State handed from bptree_iterate() to its per-block callback,
+ * bptree_visit_cb().
+ *
+ * The declaration is a typedef so that bptree_args_t names the type; without
+ * the typedef keyword the trailing identifier would instead define a global
+ * object of this struct type.
+ */
+typedef struct bptree_args {
+       bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */
+       boolean_t ba_free;      /* true if freeing during traversal */
+
+       bptree_itor_t *ba_func; /* function to call for each blockpointer */
+       void *ba_arg;           /* caller supplied argument to ba_func */
+       dmu_tx_t *ba_tx;        /* caller supplied tx, NULL if not freeing */
+} bptree_args_t;
+
+/*
+ * Allocate a new bptree object in os and return its object number.  The
+ * bptree_phys_t header lives in the object's bonus buffer and starts out
+ * describing an empty queue.
+ */
+uint64_t
+bptree_alloc(objset_t *os, dmu_tx_t *tx)
+{
+       dmu_buf_t *db;
+       bptree_phys_t *phys;
+       uint64_t obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
+           SPA_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
+           sizeof (bptree_phys_t), tx);
+
+       VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+       dmu_buf_will_dirty(db, tx);
+
+       /*
+        * A new bonus buffer is already zero-filled; the fields are
+        * cleared explicitly anyway so the initial state is obvious.
+        */
+       phys = db->db_data;
+       phys->bt_begin = 0;
+       phys->bt_end = 0;
+       phys->bt_bytes = 0;
+       phys->bt_comp = 0;
+       phys->bt_uncomp = 0;
+
+       dmu_buf_rele(db, FTAG);
+       return (obj);
+}
+
+/*
+ * Free the bptree object obj.  The tree is expected to be fully drained:
+ * begin and end must coincide and all space counters must be zero.
+ * Returns the result of dmu_object_free().
+ */
+int
+bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+       int err;
+       dmu_buf_t *db;
+       bptree_phys_t *bt;
+
+       /* Hold the bonus buffer just long enough to check the header. */
+       VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+       bt = db->db_data;
+       ASSERT3U(bt->bt_begin, ==, bt->bt_end);
+       ASSERT3U(bt->bt_bytes, ==, 0);
+       ASSERT3U(bt->bt_comp, ==, 0);
+       ASSERT3U(bt->bt_uncomp, ==, 0);
+       dmu_buf_rele(db, FTAG);
+
+       err = dmu_object_free(os, obj, tx);
+       return (err);
+}
+
+/*
+ * Append a new entry for the root block pointer bp (born in birth_txg) to
+ * the bptree object obj and add bytes/comp/uncomp to the tree's space
+ * totals.  Must be called in syncing context.
+ */
+void
+bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
+    uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
+{
+       dmu_buf_t *db;
+       bptree_phys_t *phys;
+       bptree_entry_phys_t entry;
+
+       /*
+        * bptree objects live in the pool's mos, so they may only be
+        * changed in syncing context.  Only the sync thread modifies them,
+        * which is why no locking is taken here.
+        */
+       ASSERT(dmu_tx_is_syncing(tx));
+
+       VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+       phys = db->db_data;
+
+       /* Build the entry with a zeroed bookmark and write it at bt_end. */
+       bzero(&entry.be_zb, sizeof (entry.be_zb));
+       entry.be_bp = *bp;
+       entry.be_birth_txg = birth_txg;
+       dmu_write(os, obj, phys->bt_end * sizeof (entry), sizeof (entry),
+           &entry, tx);
+
+       /* Then account for the new entry in the header. */
+       dmu_buf_will_dirty(db, tx);
+       phys->bt_end += 1;
+       phys->bt_bytes += bytes;
+       phys->bt_comp += comp;
+       phys->bt_uncomp += uncomp;
+
+       dmu_buf_rele(db, FTAG);
+}
+
+/*
+ * Traversal callback used by bptree_iterate().  Forwards each non-NULL
+ * block pointer to the caller's function and, when freeing and that
+ * function succeeded, subtracts the block's sizes from the bptree's space
+ * totals.  Returns whatever the caller's function returned (0 for a NULL
+ * bp).
+ */
+/* ARGSUSED */
+static int
+bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
+    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+       struct bptree_args *args = arg;
+       bptree_phys_t *phys;
+       int error;
+
+       /* Nothing to do when no block pointer is supplied. */
+       if (bp == NULL)
+               return (0);
+
+       error = args->ba_func(args->ba_arg, bp, args->ba_tx);
+       if (error != 0 || !args->ba_free)
+               return (error);
+
+       /* Block handled while freeing: take it out of the totals. */
+       phys = args->ba_phys;
+       phys->bt_bytes -= bp_get_dsize_sync(spa, bp);
+       phys->bt_comp -= BP_GET_PSIZE(bp);
+       phys->bt_uncomp -= BP_GET_UCSIZE(bp);
+
+       return (error);
+}
+
+/*
+ * Walk the entries of the bptree object obj, from bt_begin up to bt_end,
+ * and run func (with arg and tx) on the blocks reachable from each entry's
+ * root block pointer, using a post-order traverse_dataset_destroyed().
+ *
+ * If free is set the caller must be in syncing context.  Each entry that
+ * is traversed completely is removed from the object and bt_begin is
+ * advanced past it.  If a traversal returns an error (expected to be
+ * ERESTART only) the entry, with the bookmark saved for a future resume,
+ * is written back and the walk stops.
+ *
+ * If free is not set the entries are left untouched and the walk stops at
+ * the first entry whose traversal fails.
+ *
+ * Returns 0 on success, otherwise the error that stopped the walk.
+ */
+int
+bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
+    void *arg, dmu_tx_t *tx)
+{
+       int err;
+       uint64_t i;
+       dmu_buf_t *db;
+       struct bptree_args ba;
+
+       ASSERT(!free || dmu_tx_is_syncing(tx));
+
+       err = dmu_bonus_hold(os, obj, FTAG, &db);
+       if (err != 0)
+               return (err);
+
+       /* the header in the bonus buffer is only modified when freeing */
+       if (free)
+               dmu_buf_will_dirty(db, tx);
+
+       ba.ba_phys = db->db_data;
+       ba.ba_free = free;
+       ba.ba_func = func;
+       ba.ba_arg = arg;
+       ba.ba_tx = tx;
+
+       err = 0;
+       for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
+               bptree_entry_phys_t bte;
+
+               /* when freeing, bt_begin advances in step with i */
+               ASSERT(!free || i == ba.ba_phys->bt_begin);
+
+               err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
+                   &bte, DMU_READ_NO_PREFETCH);
+               if (err != 0)
+                       break;
+
+               err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
+                   bte.be_birth_txg, &bte.be_zb, TRAVERSE_POST,
+                   bptree_visit_cb, &ba);
+               if (free) {
+                       ASSERT(err == 0 || err == ERESTART);
+                       if (err != 0) {
+                               /* save bookmark for future resume */
+                               ASSERT3U(bte.be_zb.zb_objset, ==,
+                                   ZB_DESTROYED_OBJSET);
+                               ASSERT3U(bte.be_zb.zb_level, ==, 0);
+                               dmu_write(os, obj, i * sizeof (bte),
+                                   sizeof (bte), &bte, tx);
+                               break;
+                       } else {
+                               ba.ba_phys->bt_begin++;
+                               (void) dmu_free_range(os, obj,
+                                   i * sizeof (bte), sizeof (bte), tx);
+                       }
+               } else if (err != 0) {
+                       /*
+                        * Stop here so that the next iteration's
+                        * dmu_read() cannot overwrite, and so lose,
+                        * this traversal error.
+                        */
+                       break;
+               }
+       }
+
+       ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
+
+       /* if all blocks are free there should be no used space */
+       if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
+               ASSERT3U(ba.ba_phys->bt_bytes, ==, 0);
+               ASSERT3U(ba.ba_phys->bt_comp, ==, 0);
+               ASSERT3U(ba.ba_phys->bt_uncomp, ==, 0);
+       }
+
+       dmu_buf_rele(db, FTAG);
+
+       return (err);
+}
index 1f6fa93..205abaa 100644 (file)
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -260,7 +261,7 @@ dbuf_is_metadata(dmu_buf_impl_t *db)
                boolean_t is_metadata;
 
                DB_DNODE_ENTER(db);
-               is_metadata = dmu_ot[DB_DNODE(db)->dn_type].ot_metadata;
+               is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
                DB_DNODE_EXIT(db);
 
                return (is_metadata);
@@ -2188,7 +2189,24 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
                        dbuf_evict(db);
                } else {
                        VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
-                       if (!DBUF_IS_CACHEABLE(db))
+
+                       /*
+                        * A dbuf will be eligible for eviction if either the
+                        * 'primarycache' property is set or a duplicate
+                        * copy of this buffer is already cached in the arc.
+                        *
+                        * In the case of the 'primarycache' a buffer
+                        * is considered for eviction if it matches the
+                        * criteria set in the property.
+                        *
+                        * To decide if our buffer is considered a
+                        * duplicate, we must call into the arc to determine
+                        * if multiple buffers are referencing the same
+                        * block on-disk. If so, then we simply evict
+                        * ourselves.
+                        */
+                       if (!DBUF_IS_CACHEABLE(db) ||
+                           arc_buf_eviction_needed(db->db_buf))
                                dbuf_clear(db);
                        else
                                mutex_exit(&db->db_mtx);
index 6221157..ef86861 100644 (file)
@@ -21,6 +21,7 @@
 
 /*
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -1120,11 +1121,9 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
        ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
 
        if (spa->spa_ddt_stat_object == 0) {
-               spa->spa_ddt_stat_object = zap_create(ddt->ddt_os,
-                   DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx);
-               VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT,
-                   DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
-                   &spa->spa_ddt_stat_object, tx) == 0);
+               spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
+                   DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
+                   DMU_POOL_DDT_STATS, tx);
        }
 
        while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
index 00a7a07..e856356 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu.h>
 #endif
 
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
-       {       byteswap_uint8_array,   TRUE,   "unallocated"           },
-       {       zap_byteswap,           TRUE,   "object directory"      },
-       {       byteswap_uint64_array,  TRUE,   "object array"          },
-       {       byteswap_uint8_array,   TRUE,   "packed nvlist"         },
-       {       byteswap_uint64_array,  TRUE,   "packed nvlist size"    },
-       {       byteswap_uint64_array,  TRUE,   "bpobj"                 },
-       {       byteswap_uint64_array,  TRUE,   "bpobj header"          },
-       {       byteswap_uint64_array,  TRUE,   "SPA space map header"  },
-       {       byteswap_uint64_array,  TRUE,   "SPA space map"         },
-       {       byteswap_uint64_array,  TRUE,   "ZIL intent log"        },
-       {       dnode_buf_byteswap,     TRUE,   "DMU dnode"             },
-       {       dmu_objset_byteswap,    TRUE,   "DMU objset"            },
-       {       byteswap_uint64_array,  TRUE,   "DSL directory"         },
-       {       zap_byteswap,           TRUE,   "DSL directory child map"},
-       {       zap_byteswap,           TRUE,   "DSL dataset snap map"  },
-       {       zap_byteswap,           TRUE,   "DSL props"             },
-       {       byteswap_uint64_array,  TRUE,   "DSL dataset"           },
-       {       zfs_znode_byteswap,     TRUE,   "ZFS znode"             },
-       {       zfs_oldacl_byteswap,    TRUE,   "ZFS V0 ACL"            },
-       {       byteswap_uint8_array,   FALSE,  "ZFS plain file"        },
-       {       zap_byteswap,           TRUE,   "ZFS directory"         },
-       {       zap_byteswap,           TRUE,   "ZFS master node"       },
-       {       zap_byteswap,           TRUE,   "ZFS delete queue"      },
-       {       byteswap_uint8_array,   FALSE,  "zvol object"           },
-       {       zap_byteswap,           TRUE,   "zvol prop"             },
-       {       byteswap_uint8_array,   FALSE,  "other uint8[]"         },
-       {       byteswap_uint64_array,  FALSE,  "other uint64[]"        },
-       {       zap_byteswap,           TRUE,   "other ZAP"             },
-       {       zap_byteswap,           TRUE,   "persistent error log"  },
-       {       byteswap_uint8_array,   TRUE,   "SPA history"           },
-       {       byteswap_uint64_array,  TRUE,   "SPA history offsets"   },
-       {       zap_byteswap,           TRUE,   "Pool properties"       },
-       {       zap_byteswap,           TRUE,   "DSL permissions"       },
-       {       zfs_acl_byteswap,       TRUE,   "ZFS ACL"               },
-       {       byteswap_uint8_array,   TRUE,   "ZFS SYSACL"            },
-       {       byteswap_uint8_array,   TRUE,   "FUID table"            },
-       {       byteswap_uint64_array,  TRUE,   "FUID table size"       },
-       {       zap_byteswap,           TRUE,   "DSL dataset next clones"},
-       {       zap_byteswap,           TRUE,   "scan work queue"       },
-       {       zap_byteswap,           TRUE,   "ZFS user/group used"   },
-       {       zap_byteswap,           TRUE,   "ZFS user/group quota"  },
-       {       zap_byteswap,           TRUE,   "snapshot refcount tags"},
-       {       zap_byteswap,           TRUE,   "DDT ZAP algorithm"     },
-       {       zap_byteswap,           TRUE,   "DDT statistics"        },
-       {       byteswap_uint8_array,   TRUE,   "System attributes"     },
-       {       zap_byteswap,           TRUE,   "SA master node"        },
-       {       zap_byteswap,           TRUE,   "SA attr registration"  },
-       {       zap_byteswap,           TRUE,   "SA attr layouts"       },
-       {       zap_byteswap,           TRUE,   "scan translations"     },
-       {       byteswap_uint8_array,   FALSE,  "deduplicated block"    },
-       {       zap_byteswap,           TRUE,   "DSL deadlist map"      },
-       {       byteswap_uint64_array,  TRUE,   "DSL deadlist map hdr"  },
-       {       zap_byteswap,           TRUE,   "DSL dir clones"        },
-       {       byteswap_uint64_array,  TRUE,   "bpobj subobj"          },
+       {       DMU_BSWAP_UINT8,        TRUE,   "unallocated"           },
+       {       DMU_BSWAP_ZAP,          TRUE,   "object directory"      },
+       {       DMU_BSWAP_UINT64,       TRUE,   "object array"          },
+       {       DMU_BSWAP_UINT8,        TRUE,   "packed nvlist"         },
+       {       DMU_BSWAP_UINT64,       TRUE,   "packed nvlist size"    },
+       {       DMU_BSWAP_UINT64,       TRUE,   "bpobj"                 },
+       {       DMU_BSWAP_UINT64,       TRUE,   "bpobj header"          },
+       {       DMU_BSWAP_UINT64,       TRUE,   "SPA space map header"  },
+       {       DMU_BSWAP_UINT64,       TRUE,   "SPA space map"         },
+       {       DMU_BSWAP_UINT64,       TRUE,   "ZIL intent log"        },
+       {       DMU_BSWAP_DNODE,        TRUE,   "DMU dnode"             },
+       {       DMU_BSWAP_OBJSET,       TRUE,   "DMU objset"            },
+       {       DMU_BSWAP_UINT64,       TRUE,   "DSL directory"         },
+       {       DMU_BSWAP_ZAP,          TRUE,   "DSL directory child map"},
+       {       DMU_BSWAP_ZAP,          TRUE,   "DSL dataset snap map"  },
+       {       DMU_BSWAP_ZAP,          TRUE,   "DSL props"             },
+       {       DMU_BSWAP_UINT64,       TRUE,   "DSL dataset"           },
+       {       DMU_BSWAP_ZNODE,        TRUE,   "ZFS znode"             },
+       {       DMU_BSWAP_OLDACL,       TRUE,   "ZFS V0 ACL"            },
+       {       DMU_BSWAP_UINT8,        FALSE,  "ZFS plain file"        },
+       {       DMU_BSWAP_ZAP,          TRUE,   "ZFS directory"         },
+       {       DMU_BSWAP_ZAP,          TRUE,   "ZFS master node"       },
+       {       DMU_BSWAP_ZAP,          TRUE,   "ZFS delete queue"      },
+       {       DMU_BSWAP_UINT8,        FALSE,  "zvol object"           },
+       {       DMU_BSWAP_ZAP,          TRUE,   "zvol prop"             },
+       {       DMU_BSWAP_UINT8,        FALSE,  "other uint8[]"         },
+       {       DMU_BSWAP_UINT64,       FALSE,  "other uint64[]"        },
+       {       DMU_BSWAP_ZAP,          TRUE,   "other ZAP"             },
+       {       DMU_BSWAP_ZAP,          TRUE,   "persistent error log"  },
+       {       DMU_BSWAP_UINT8,        TRUE,   "SPA history"           },
+       {       DMU_BSWAP_UINT64,       TRUE,   "SPA history offsets"   },
+       {       DMU_BSWAP_ZAP,          TRUE,   "Pool properties"       },
+       {       DMU_BSWAP_ZAP,          TRUE,   "DSL permissions"       },
+       {       DMU_BSWAP_ACL,          TRUE,   "ZFS ACL"               },
+       {       DMU_BSWAP_UINT8,        TRUE,   "ZFS SYSACL"            },
+       {       DMU_BSWAP_UINT8,        TRUE,   "FUID table"            },
+       {       DMU_BSWAP_UINT64,       TRUE,   "FUID table size"       },
+       {       DMU_BSWAP_ZAP,          TRUE,   "DSL dataset next clones"},
+       {       DMU_BSWAP_ZAP,          TRUE,   "scan work queue"       },
+       {       DMU_BSWAP_ZAP,          TRUE,   "ZFS user/group used"   },
+       {       DMU_BSWAP_ZAP,          TRUE,   "ZFS user/group quota"  },
+       {       DMU_BSWAP_ZAP,          TRUE,   "snapshot refcount tags"},
+       {       DMU_BSWAP_ZAP,          TRUE,   "DDT ZAP algorithm"     },
+       {       DMU_BSWAP_ZAP,          TRUE,   "DDT statistics"        },
+       {       DMU_BSWAP_UINT8,        TRUE,   "System attributes"     },
+       {       DMU_BSWAP_ZAP,          TRUE,   "SA master node"        },
+       {       DMU_BSWAP_ZAP,          TRUE,   "SA attr registration"  },
+       {       DMU_BSWAP_ZAP,          TRUE,   "SA attr layouts"       },
+       {       DMU_BSWAP_ZAP,          TRUE,   "scan translations"     },
+       {       DMU_BSWAP_UINT8,        FALSE,  "deduplicated block"    },
+       {       DMU_BSWAP_ZAP,          TRUE,   "DSL deadlist map"      },
+       {       DMU_BSWAP_UINT64,       TRUE,   "DSL deadlist map hdr"  },
+       {       DMU_BSWAP_ZAP,          TRUE,   "DSL dir clones"        },
+       {       DMU_BSWAP_UINT64,       TRUE,   "bpobj subobj"          }
+};
+
+/*
+ * Table of byteswap routines, each paired with a short name.  Callers index
+ * it with the dmu_object_byteswap_t value that DMU_OT_BYTESWAP() yields for
+ * an object type and invoke the entry's ob_func.
+ *
+ * NOTE(review): the entry order presumably has to match the DMU_BSWAP_*
+ * values declared in the header -- confirm before adding or reordering
+ * entries.
+ */
+const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
+       {       byteswap_uint8_array,   "uint8"         },
+       {       byteswap_uint16_array,  "uint16"        },
+       {       byteswap_uint32_array,  "uint32"        },
+       {       byteswap_uint64_array,  "uint64"        },
+       {       zap_byteswap,           "zap"           },
+       {       dnode_buf_byteswap,     "dnode"         },
+       {       dmu_objset_byteswap,    "objset"        },
+       {       zfs_znode_byteswap,     "znode"         },
+       {       zfs_oldacl_byteswap,    "oldacl"        },
+       {       zfs_acl_byteswap,       "acl"           }
 };
 
 int
@@ -176,7 +190,7 @@ dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
 
-       if (type > DMU_OT_NUMTYPES) {
+       if (!DMU_OT_IS_VALID(type)) {
                error = EINVAL;
        } else if (dn->dn_bonus != db) {
                error = EINVAL;
@@ -1695,7 +1709,7 @@ void
 dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
 {
        dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
-       boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata ||
+       boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
            (wp & WP_SPILL));
        enum zio_checksum checksum = os->os_checksum;
        enum zio_compress compress = os->os_compress;
@@ -1939,15 +1953,15 @@ dmu_init(void)
        dbuf_init();
        zfetch_init();
        dmu_tx_init();
-       arc_init();
        l2arc_init();
+       arc_init();
 }
 
 void
 dmu_fini(void)
 {
-       l2arc_fini();
        arc_fini();
+       l2arc_fini();
        dmu_tx_fini();
        zfetch_fini();
        dbuf_fini();
index 949f4d7..0cf3c4a 100644 (file)
@@ -1077,8 +1077,8 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
        void *data = NULL;
 
        if (drro->drr_type == DMU_OT_NONE ||
-           drro->drr_type >= DMU_OT_NUMTYPES ||
-           drro->drr_bonustype >= DMU_OT_NUMTYPES ||
+           !DMU_OT_IS_VALID(drro->drr_type) ||
+           !DMU_OT_IS_VALID(drro->drr_bonustype) ||
            drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
            drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
            P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
@@ -1143,7 +1143,9 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
                ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
                bcopy(data, db->db_data, drro->drr_bonuslen);
                if (ra->byteswap) {
-                       dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
+                       dmu_object_byteswap_t byteswap =
+                           DMU_OT_BYTESWAP(drro->drr_bonustype);
+                       dmu_ot_byteswap[byteswap].ob_func(db->db_data,
                            drro->drr_bonuslen);
                }
                dmu_buf_rele(db, FTAG);
@@ -1186,7 +1188,7 @@ restore_write(struct restorearg *ra, objset_t *os,
        int err;
 
        if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
-           drrw->drr_type >= DMU_OT_NUMTYPES)
+           !DMU_OT_IS_VALID(drrw->drr_type))
                return (EINVAL);
 
        data = restore_read(ra, drrw->drr_length);
@@ -1205,8 +1207,11 @@ restore_write(struct restorearg *ra, objset_t *os,
                dmu_tx_abort(tx);
                return (err);
        }
-       if (ra->byteswap)
-               dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length);
+       if (ra->byteswap) {
+               dmu_object_byteswap_t byteswap =
+                   DMU_OT_BYTESWAP(drrw->drr_type);
+               dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length);
+       }
        dmu_write(os, drrw->drr_object,
            drrw->drr_offset, drrw->drr_length, data, tx);
        dmu_tx_commit(tx);
@@ -1604,13 +1609,6 @@ dmu_recv_existing_end(dmu_recv_cookie_t *drc)
        dsl_dataset_t *ds = drc->drc_logical_ds;
        int err, myerr;
 
-       /*
-        * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
-        * expects it to have a ds_user_ptr (and zil), but clone_swap()
-        * can close it.
-        */
-       txg_wait_synced(ds->ds_dir->dd_pool, 0);
-
        if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
                err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
                    drc->drc_force);
index 376f60f..980c1aa 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -53,6 +54,7 @@ typedef struct traverse_data {
        uint64_t td_objset;
        blkptr_t *td_rootbp;
        uint64_t td_min_txg;
+       zbookmark_t *td_resume;
        int td_flags;
        prefetch_data_t *td_pfd;
        blkptr_cb_t *td_func;
@@ -128,6 +130,54 @@ traverse_zil(traverse_data_t *td, zil_header_t *zh)
        zil_free(zilog);
 }
 
+/*
+ * Verdict returned by resume_skip_check() for a block met while a
+ * traversal may be resuming from a saved bookmark.
+ */
+typedef enum resume_skip {
+       RESUME_SKIP_ALL,        /* do not visit the block at all */
+       RESUME_SKIP_NONE,       /* visit the block normally */
+       RESUME_SKIP_CHILDREN    /* visit the block but not its children */
+} resume_skip_t;
+
+/*
+ * Decide how the block at zb is to be treated when the traversal described
+ * by td may be resuming from the bookmark in td->td_resume:
+ *
+ *   RESUME_SKIP_ALL       we are resuming and the block lies before the
+ *                         resume point, so it need not be visited at all;
+ *   RESUME_SKIP_CHILDREN  we are resuming a post traversal and this block
+ *                         is the resume point: visit it, but not its
+ *                         children (an earlier traversal must already
+ *                         have visited them);
+ *   RESUME_SKIP_NONE      every other case.
+ *
+ * Reaching the resume point zeroes the bookmark, which marks the traversal
+ * as resumed.
+ */
+static resume_skip_t
+resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
+    const zbookmark_t *zb)
+{
+       /* No bookmark, or an all-zero one: there is nothing to resume. */
+       if (td->td_resume == NULL || ZB_IS_ZERO(td->td_resume))
+               return (RESUME_SKIP_NONE);
+
+       /*
+        * This bp and everything below it was already visited, so there
+        * is no point in doing it again.
+        */
+       if (zbookmark_is_before(dnp, zb, td->td_resume))
+               return (RESUME_SKIP_ALL);
+
+       ASSERT3U(zb->zb_object, <=, td->td_resume->zb_object);
+       if (bcmp(zb, td->td_resume, sizeof (*zb)) != 0)
+               return (RESUME_SKIP_NONE);
+
+       /*
+        * This is the block we are resuming from; zero the bookmark out to
+        * record that the traversal has resumed.
+        */
+       bzero(td->td_resume, sizeof (*zb));
+       if (td->td_flags & TRAVERSE_POST)
+               return (RESUME_SKIP_CHILDREN);
+
+       return (RESUME_SKIP_NONE);
+}
+
+/*
+ * Record zb in td->td_resume so that a later traversal can pick up from
+ * this block.  The traversal must have a resume bookmark, and zb is
+ * expected to be a level-0 bookmark.
+ */
+static void
+traverse_pause(traverse_data_t *td, const zbookmark_t *zb)
+{
+       zbookmark_t *dst;
+
+       ASSERT(td->td_resume != NULL);
+       ASSERT3U(zb->zb_level, ==, 0);
+
+       dst = td->td_resume;
+       bcopy(zb, dst, sizeof (*dst));
+}
+
 static int
 traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
     arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
@@ -137,8 +187,20 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
        arc_buf_t *buf = NULL;
        prefetch_data_t *pd = td->td_pfd;
        boolean_t hard = td->td_flags & TRAVERSE_HARD;
+       boolean_t pause = B_FALSE;
+
+       switch (resume_skip_check(td, dnp, zb)) {
+       case RESUME_SKIP_ALL:
+               return (0);
+       case RESUME_SKIP_CHILDREN:
+               goto post;
+       case RESUME_SKIP_NONE:
+               break;
+       default:
+               ASSERT(0);
+       }
 
-       if (bp->blk_birth == 0) {
+       if (BP_IS_HOLE(bp)) {
                err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp,
                    td->td_arg);
                return (err);
@@ -164,8 +226,10 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
                    td->td_arg);
                if (err == TRAVERSE_VISIT_NO_CHILDREN)
                        return (0);
-               if (err)
-                       return (err);
+               if (err == ERESTART)
+                       pause = B_TRUE; /* handle pausing at a common point */
+               if (err != 0)
+                       goto post;
        }
 
        if (BP_GET_LEVEL(bp) > 0) {
@@ -253,9 +317,18 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
        if (buf)
                (void) arc_buf_remove_ref(buf, &buf);
 
+post:
        if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) {
                err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
                    td->td_arg);
+               if (err == ERESTART)
+                       pause = B_TRUE;
+       }
+
+       if (pause && td->td_resume != NULL) {
+               ASSERT3U(err, ==, ERESTART);
+               ASSERT(!hard);
+               traverse_pause(td, zb);
        }
 
        return (err != 0 ? err : lasterr);
@@ -353,22 +426,27 @@ traverse_prefetch_thread(void *arg)
  * in syncing context).
  */
 static int
-traverse_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *rootbp,
-    uint64_t txg_start, int flags, blkptr_cb_t func, void *arg)
+traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
+    uint64_t txg_start, zbookmark_t *resume, int flags,
+    blkptr_cb_t func, void *arg)
 {
        traverse_data_t *td;
        prefetch_data_t *pd;
        zbookmark_t *czb;
        int err;
 
+       ASSERT(ds == NULL || objset == ds->ds_object);
+       ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
+
        td = kmem_alloc(sizeof(traverse_data_t), KM_PUSHPAGE);
        pd = kmem_zalloc(sizeof(prefetch_data_t), KM_PUSHPAGE);
        czb = kmem_alloc(sizeof(zbookmark_t), KM_PUSHPAGE);
 
        td->td_spa = spa;
-       td->td_objset = ds ? ds->ds_object : 0;
+       td->td_objset = objset;
        td->td_rootbp = rootbp;
        td->td_min_txg = txg_start;
+       td->td_resume = resume;
        td->td_func = func;
        td->td_arg = arg;
        td->td_pfd = pd;
@@ -424,8 +502,17 @@ int
 traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
     blkptr_cb_t func, void *arg)
 {
-       return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds,
-           &ds->ds_phys->ds_bp, txg_start, flags, func, arg));
+       return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
+           &ds->ds_phys->ds_bp, txg_start, NULL, flags, func, arg));
+}
+
+int
+traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
+    uint64_t txg_start, zbookmark_t *resume, int flags,
+    blkptr_cb_t func, void *arg)
+{
+       return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET, /* ds: NULL */
+           blkptr, txg_start, resume, flags, func, arg)); /* resume fwd'd */
 }
 
 /*
@@ -442,8 +529,8 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
        boolean_t hard = (flags & TRAVERSE_HARD);
 
        /* visit the MOS */
-       err = traverse_impl(spa, NULL, spa_get_rootblkptr(spa),
-           txg_start, flags, func, arg);
+       err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
+           txg_start, NULL, flags, func, arg);
        if (err)
                return (err);
 
index 81c6dfe..47ec4c1 100644 (file)
@@ -20,9 +20,8 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-/*
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -693,7 +692,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
                return;
        }
 
-       ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap);
+       ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
 
        if (dn->dn_maxblkid == 0 && !add) {
                blkptr_t *bp;
index 99ac625..3a8a5e3 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -194,7 +195,7 @@ dnode_verify(dnode_t *dn)
        ASSERT(dn->dn_objset);
        ASSERT(dn->dn_handle->dnh_dnode == dn);
 
-       ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+       ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 
        if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
                return;
@@ -212,7 +213,7 @@ dnode_verify(dnode_t *dn)
                        ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
                }
                ASSERT3U(dn->dn_nlevels, <=, 30);
-               ASSERT3U(dn->dn_type, <=, DMU_OT_NUMTYPES);
+               ASSERT(DMU_OT_IS_VALID(dn->dn_type));
                ASSERT3U(dn->dn_nblkptr, >=, 1);
                ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
                ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
@@ -278,8 +279,10 @@ dnode_byteswap(dnode_phys_t *dnp)
                 */
                int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
                size_t len = DN_MAX_BONUSLEN - off;
-               ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES);
-               dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len);
+               dmu_object_byteswap_t byteswap;
+               ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
+               byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
+               dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len);
        }
 
        /* Swap SPILL block if we have one */
@@ -407,7 +410,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
 
        dmu_zfetch_init(&dn->dn_zfetch, dn);
 
-       ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+       ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 
        mutex_enter(&os->os_lock);
        list_insert_head(&os->os_dnodes, dn);
@@ -496,11 +499,11 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
        ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
        ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
        ASSERT(ot != DMU_OT_NONE);
-       ASSERT3U(ot, <, DMU_OT_NUMTYPES);
+       ASSERT(DMU_OT_IS_VALID(ot));
        ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
            (bonustype == DMU_OT_SA && bonuslen == 0) ||
            (bonustype != DMU_OT_NONE && bonuslen != 0));
-       ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
+       ASSERT(DMU_OT_IS_VALID(bonustype));
        ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
        ASSERT(dn->dn_type == DMU_OT_NONE);
        ASSERT3U(dn->dn_maxblkid, ==, 0);
@@ -568,7 +571,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
        ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
            (bonustype != DMU_OT_NONE && bonuslen != 0) ||
            (bonustype == DMU_OT_SA && bonuslen == 0));
-       ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
+       ASSERT(DMU_OT_IS_VALID(bonustype));
        ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
 
        /* clean up any unreferenced dbufs */
index f2dda86..58fa473 100644 (file)
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -598,7 +600,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
        }
 
        if (dn->dn_next_bonustype[txgoff]) {
-               ASSERT(dn->dn_next_bonustype[txgoff] < DMU_OT_NUMTYPES);
+               ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff]));
                dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
                dn->dn_next_bonustype[txgoff] = 0;
        }
index 21fdd08..c5b84a2 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  */
 
@@ -35,6 +35,7 @@
 #include <sys/arc.h>
 #include <sys/zio.h>
 #include <sys/zap.h>
+#include <sys/zfeature.h>
 #include <sys/unique.h>
 #include <sys/zfs_context.h>
 #include <sys/zfs_ioctl.h>
@@ -102,16 +103,10 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
        if (BP_IS_HOLE(bp))
                return;
        ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
-       ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
+       ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
        if (ds == NULL) {
-               /*
-                * Account for the meta-objset space in its placeholder
-                * dsl_dir.
-                */
-               ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
-               dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
-                   used, compressed, uncompressed, tx);
-               dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
+               dsl_pool_mos_diduse_space(tx->tx_pool,
+                   used, compressed, uncompressed);
                return;
        }
        dmu_buf_will_dirty(ds->ds_dbuf, tx);
@@ -119,7 +114,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
        mutex_enter(&ds->ds_dir->dd_lock);
        mutex_enter(&ds->ds_lock);
        delta = parent_delta(ds, used);
-       ds->ds_phys->ds_used_bytes += used;
+       ds->ds_phys->ds_referenced_bytes += used;
        ds->ds_phys->ds_compressed_bytes += compressed;
        ds->ds_phys->ds_uncompressed_bytes += uncompressed;
        ds->ds_phys->ds_unique_bytes += used;
@@ -149,15 +144,9 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 
        ASSERT(used > 0);
        if (ds == NULL) {
-               /*
-                * Account for the meta-objset space in its placeholder
-                * dataset.
-                */
                dsl_free(tx->tx_pool, tx->tx_txg, bp);
-
-               dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
-                   -used, -compressed, -uncompressed, tx);
-               dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
+               dsl_pool_mos_diduse_space(tx->tx_pool,
+                   -used, -compressed, -uncompressed);
                return (used);
        }
        ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
@@ -215,8 +204,8 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
                }
        }
        mutex_enter(&ds->ds_lock);
-       ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
-       ds->ds_phys->ds_used_bytes -= used;
+       ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used);
+       ds->ds_phys->ds_referenced_bytes -= used;
        ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
        ds->ds_phys->ds_compressed_bytes -= compressed;
        ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
@@ -823,8 +812,8 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
                dsphys->ds_prev_snap_obj = origin->ds_object;
                dsphys->ds_prev_snap_txg =
                    origin->ds_phys->ds_creation_txg;
-               dsphys->ds_used_bytes =
-                   origin->ds_phys->ds_used_bytes;
+               dsphys->ds_referenced_bytes =
+                   origin->ds_phys->ds_referenced_bytes;
                dsphys->ds_compressed_bytes =
                    origin->ds_phys->ds_compressed_bytes;
                dsphys->ds_uncompressed_bytes =
@@ -938,7 +927,6 @@ dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer, char *failed)
        for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
            pair = nvlist_next_nvpair(snaps, pair)) {
                dsl_dataset_t *ds;
-               int err;
 
                err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds);
                if (err == 0) {
@@ -1074,55 +1062,55 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
        dummy_ds->ds_dir = dd;
        dummy_ds->ds_object = ds->ds_object;
 
-       /*
-        * Check for errors and mark this ds as inconsistent, in
-        * case we crash while freeing the objects.
-        */
-       err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
-           dsl_dataset_destroy_begin_sync, ds, NULL, 0);
-       if (err)
-               goto out_free;
+       if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds),
+           &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+               /*
+                * Check for errors and mark this ds as inconsistent, in
+                * case we crash while freeing the objects.
+                */
+               err = dsl_sync_task_do(dd->dd_pool,
+                   dsl_dataset_destroy_begin_check,
+                   dsl_dataset_destroy_begin_sync, ds, NULL, 0);
+               if (err)
+                       goto out_free;
 
-       err = dmu_objset_from_ds(ds, &os);
-       if (err)
-               goto out_free;
+               err = dmu_objset_from_ds(ds, &os);
+               if (err)
+                       goto out_free;
 
-       /*
-        * remove the objects in open context, so that we won't
-        * have too much to do in syncing context.
-        */
-       for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
-           ds->ds_phys->ds_prev_snap_txg)) {
                /*
-                * Ignore errors, if there is not enough disk space
-                * we will deal with it in dsl_dataset_destroy_sync().
+                * Remove all objects while in the open context so that
+                * there is less work to do in the syncing context.
                 */
-               (void) dmu_free_object(os, obj);
-       }
-       if (err != ESRCH)
-               goto out_free;
-
-       /*
-        * Only the ZIL knows how to free log blocks.
-        */
-       zil_destroy(dmu_objset_zil(os), B_FALSE);
+               for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
+                   ds->ds_phys->ds_prev_snap_txg)) {
+                       /*
+                        * Ignore errors, if there is not enough disk space
+                        * we will deal with it in dsl_dataset_destroy_sync().
+                        */
+                       (void) dmu_free_object(os, obj);
+               }
+               if (err != ESRCH)
+                       goto out_free;
 
-       /*
-        * Sync out all in-flight IO.
-        */
-       txg_wait_synced(dd->dd_pool, 0);
+               /*
+                * Sync out all in-flight IO.
+                */
+               txg_wait_synced(dd->dd_pool, 0);
 
-       /*
-        * If we managed to free all the objects in open
-        * context, the user space accounting should be zero.
-        */
-       if (ds->ds_phys->ds_bp.blk_fill == 0 &&
-           dmu_objset_userused_enabled(os)) {
-               ASSERTV(uint64_t count);
-               ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 ||
-                   count == 0);
-               ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 ||
-                   count == 0);
+               /*
+                * If we managed to free all the objects in open
+                * context, the user space accounting should be zero.
+                */
+               if (ds->ds_phys->ds_bp.blk_fill == 0 &&
+                   dmu_objset_userused_enabled(os)) {
+                       ASSERTV(uint64_t count);
+
+                       ASSERT(zap_count(os, DMU_USERUSED_OBJECT,
+                           &count) != 0 || count == 0);
+                       ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT,
+                           &count) != 0 || count == 0);
+               }
        }
 
        rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
@@ -1261,7 +1249,7 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
        ASSERT(!dsl_dataset_is_snapshot(ds));
 
        if (ds->ds_phys->ds_prev_snap_obj != 0)
-               mrs_used = ds->ds_prev->ds_phys->ds_used_bytes;
+               mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
        else
                mrs_used = 0;
 
@@ -1269,7 +1257,7 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
 
        ASSERT3U(dlused, <=, mrs_used);
        ds->ds_phys->ds_unique_bytes =
-           ds->ds_phys->ds_used_bytes - (mrs_used - dlused);
+           ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);
 
        if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
            SPA_VERSION_UNIQUE_ACCURATE)
@@ -1627,12 +1615,36 @@ process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
            ds_next->ds_phys->ds_deadlist_obj);
 }
 
+static int
+old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+       int err;
+       struct killarg ka;
+
+       /*
+        * Free everything that we point to (that's born after
+        * the previous snapshot, if we are a clone) by passing
+        * kill_blkptr as the callback of a TRAVERSE_POST traversal.
+        * NB: this should be very quick, because we already
+        * freed all the objects in open context.
+        */
+       ka.ds = ds;
+       ka.tx = tx;
+       err = traverse_dataset(ds,
+           ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST,
+           kill_blkptr, &ka);
+       ASSERT3U(err, ==, 0);   /* traversal is not expected to fail */
+       ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
+
+       return (err);   /* result of traverse_dataset() */
+}
+
 void
 dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
 {
        struct dsl_ds_destroyarg *dsda = arg1;
        dsl_dataset_t *ds = dsda->ds;
-       int err;
+       int err = 0;
        int after_branch_point = FALSE;
        dsl_pool_t *dp = ds->ds_dir->dd_pool;
        objset_t *mos = dp->dp_meta_objset;
@@ -1773,7 +1785,6 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
                            tx);
                        dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
                            DD_USED_HEAD, used, comp, uncomp, tx);
-                       dsl_dir_dirty(tx->tx_pool->dp_free_dir, tx);
 
                        /* Merge our deadlist into next's and free it. */
                        dsl_deadlist_merge(&ds_next->ds_deadlist,
@@ -1849,32 +1860,57 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
                }
                dsl_dataset_rele(ds_next, FTAG);
        } else {
+               zfeature_info_t *async_destroy =
+                   &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];
+               objset_t *os;
+
                /*
                 * There's no next snapshot, so this is a head dataset.
                 * Destroy the deadlist.  Unless it's a clone, the
                 * deadlist should be empty.  (If it's a clone, it's
                 * safe to ignore the deadlist contents.)
                 */
-               struct killarg ka;
-
                dsl_deadlist_close(&ds->ds_deadlist);
                dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
                ds->ds_phys->ds_deadlist_obj = 0;
 
-               /*
-                * Free everything that we point to (that's born after
-                * the previous snapshot, if we are a clone)
-                *
-                * NB: this should be very quick, because we already
-                * freed all the objects in open context.
-                */
-               ka.ds = ds;
-               ka.tx = tx;
-               err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
-                   TRAVERSE_POST, kill_blkptr, &ka);
-               ASSERT3U(err, ==, 0);
-               ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
-                   ds->ds_phys->ds_unique_bytes == 0);
+               VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
+
+               if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) {
+                       err = old_synchronous_dataset_destroy(ds, tx);
+               } else {
+                       /*
+                        * Move the bptree into the pool's list of trees to
+                        * clean up and update space accounting information.
+                        */
+                       uint64_t used, comp, uncomp;
+
+                       zil_destroy_sync(dmu_objset_zil(os), tx);
+
+                       if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
+                               spa_feature_incr(dp->dp_spa, async_destroy, tx);
+                               dp->dp_bptree_obj = bptree_alloc(mos, tx);
+                               VERIFY(zap_add(mos,
+                                   DMU_POOL_DIRECTORY_OBJECT,
+                                   DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
+                                   &dp->dp_bptree_obj, tx) == 0);
+                       }
+
+                       used = ds->ds_dir->dd_phys->dd_used_bytes;
+                       comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
+                       uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;
+
+                       ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
+                           ds->ds_phys->ds_unique_bytes == used);
+
+                       bptree_add(mos, dp->dp_bptree_obj,
+                           &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
+                           used, comp, uncomp, tx);
+                       dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
+                           -used, -comp, -uncomp, tx);
+                       dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+                           used, comp, uncomp, tx);
+               }
 
                if (ds->ds_prev != NULL) {
                        if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
@@ -2065,7 +2101,7 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
        dsphys->ds_creation_time = gethrestime_sec();
        dsphys->ds_creation_txg = crtxg;
        dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
-       dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
+       dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes;
        dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
        dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
        dsphys->ds_flags = ds->ds_phys->ds_flags;
@@ -2154,7 +2190,6 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
        dmu_buf_will_dirty(ds->ds_dbuf, tx);
        ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
 
-       dsl_dir_dirty(ds->ds_dir, tx);
        dmu_objset_sync(ds->ds_objset, zio, tx);
 }
 
@@ -2189,10 +2224,22 @@ get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
            zap_cursor_advance(&zc)) {
                dsl_dataset_t *clone;
                char buf[ZFS_MAXNAMELEN];
+               /*
+                * Even though we hold the dp_config_rwlock, the dataset
+                * may fail to open, returning ENOENT.  If there is a
+                * thread concurrently attempting to destroy this
+                * dataset, it will have the ds_rwlock held for
+                * RW_WRITER.  Our call to dsl_dataset_hold_obj() ->
+                * dsl_dataset_hold_ref() will fail its
+                * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the
+                * dp_config_rwlock, and wait for the destroy progress
+                * and signal ds_exclusive_cv.  If the destroy was
+                * successful, we will see that
+                * DSL_DATASET_IS_DESTROYED(), and return ENOENT.
+                */
                if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
-                   za.za_first_integer, FTAG, &clone) != 0) {
-                       goto fail;
-               }
+                   za.za_first_integer, FTAG, &clone) != 0)
+                       continue;
                dsl_dir_name(clone->ds_dir, buf);
                VERIFY(nvlist_add_boolean(val, buf) == 0);
                dsl_dataset_rele(clone, FTAG);
@@ -2316,7 +2363,7 @@ dsl_dataset_space(dsl_dataset_t *ds,
     uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp)
 {
-       *refdbytesp = ds->ds_phys->ds_used_bytes;
+       *refdbytesp = ds->ds_phys->ds_referenced_bytes;
        *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
        if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
                *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
@@ -2652,7 +2699,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
         * Note however, if we stop before we reach the ORIGIN we get:
         * uN + kN + kN-1 + ... + kM - uM-1
         */
-       pa->used = origin_ds->ds_phys->ds_used_bytes;
+       pa->used = origin_ds->ds_phys->ds_referenced_bytes;
        pa->comp = origin_ds->ds_phys->ds_compressed_bytes;
        pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
        for (snap = list_head(&pa->shared_snaps); snap;
@@ -2686,7 +2733,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
         * so we need to subtract out the clone origin's used space.
         */
        if (pa->origin_origin) {
-               pa->used -= pa->origin_origin->ds_phys->ds_used_bytes;
+               pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes;
                pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes;
                pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
        }
@@ -3203,8 +3250,8 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx)
                dsl_deadlist_space(&csa->ohds->ds_deadlist,
                    &odl_used, &odl_comp, &odl_uncomp);
 
-               dused = csa->cds->ds_phys->ds_used_bytes + cdl_used -
-                   (csa->ohds->ds_phys->ds_used_bytes + odl_used);
+               dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used -
+                   (csa->ohds->ds_phys->ds_referenced_bytes + odl_used);
                dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
                    (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
                duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
@@ -3233,8 +3280,8 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx)
        }
 
        /* swap ds_*_bytes */
-       SWITCH64(csa->ohds->ds_phys->ds_used_bytes,
-           csa->cds->ds_phys->ds_used_bytes);
+       SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes,
+           csa->cds->ds_phys->ds_referenced_bytes);
        SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
            csa->cds->ds_phys->ds_compressed_bytes);
        SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
@@ -3363,8 +3410,9 @@ dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
         * on-disk is over quota and there are no pending changes (which
         * may free up space for us).
         */
-       if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) {
-               if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota)
+       if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) {
+               if (inflight > 0 ||
+                   ds->ds_phys->ds_referenced_bytes < ds->ds_quota)
                        error = ERESTART;
                else
                        error = EDQUOT;
@@ -3393,7 +3441,7 @@ dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
        if (psa->psa_effective_value == 0)
                return (0);
 
-       if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes ||
+       if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes ||
            psa->psa_effective_value < ds->ds_reserved)
                return (ENOSPC);
 
@@ -4141,8 +4189,8 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
        dsl_pool_t *dp = new->ds_dir->dd_pool;
 
        *usedp = 0;
-       *usedp += new->ds_phys->ds_used_bytes;
-       *usedp -= oldsnap->ds_phys->ds_used_bytes;
+       *usedp += new->ds_phys->ds_referenced_bytes;
+       *usedp -= oldsnap->ds_phys->ds_referenced_bytes;
 
        *compp = 0;
        *compp += new->ds_phys->ds_compressed_bytes;
@@ -4158,9 +4206,13 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
                dsl_dataset_t *snap;
                uint64_t used, comp, uncomp;
 
-               err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
-               if (err != 0)
-                       break;
+               if (snapobj == new->ds_object) {
+                       snap = new;
+               } else {
+                       err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
+                       if (err != 0)
+                               break;
+               }
 
                if (snap->ds_phys->ds_prev_snap_txg ==
                    oldsnap->ds_phys->ds_creation_txg) {
@@ -4189,7 +4241,8 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
                 * was not a snapshot of/before new.
                 */
                snapobj = snap->ds_phys->ds_prev_snap_obj;
-               dsl_dataset_rele(snap, FTAG);
+               if (snap != new)
+                       dsl_dataset_rele(snap, FTAG);
                if (snapobj == 0) {
                        err = EINVAL;
                        break;
index 1e89a68..909b5f8 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/dsl_dataset.h>
@@ -165,12 +165,49 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
 
        for (zap_cursor_init(&zc, os, dlobj);
            zap_cursor_retrieve(&zc, &za) == 0;
-           zap_cursor_advance(&zc))
-               bpobj_free(os, za.za_first_integer, tx);
+           zap_cursor_advance(&zc)) {
+               uint64_t obj = za.za_first_integer;
+               if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
+                       bpobj_decr_empty(os, tx);
+               else
+                       bpobj_free(os, obj, tx);
+       }
        zap_cursor_fini(&zc);
        VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx));
 }
 
+static void
+dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
+    const blkptr_t *bp, dmu_tx_t *tx)
+{
+       if (dle->dle_bpobj.bpo_object ==
+           dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { /* on empty bpobj */
+               uint64_t obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+               bpobj_close(&dle->dle_bpobj);   /* stop using dp_empty_bpobj */
+               bpobj_decr_empty(dl->dl_os, tx);
+               VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+               VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
+                   dle->dle_mintxg, obj, tx)); /* record new obj in ZAP */
+       }
+       bpobj_enqueue(&dle->dle_bpobj, bp, tx); /* append bp to the entry */
+}
+
+static void
+dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
+    uint64_t obj, dmu_tx_t *tx)
+{
+       if (dle->dle_bpobj.bpo_object !=
+           dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
+               bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
+       } else {        /* entry is on dp_empty_bpobj: adopt obj directly */
+               bpobj_close(&dle->dle_bpobj);
+               bpobj_decr_empty(dl->dl_os, tx);
+               VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+               VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
+                   dle->dle_mintxg, obj, tx)); /* record obj in ZAP */
+       }
+}
+
 void
 dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
 {
@@ -199,7 +236,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
                dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
        else
                dle = AVL_PREV(&dl->dl_tree, dle);
-       bpobj_enqueue(&dle->dle_bpobj, bp, tx);
+       dle_enqueue(dl, dle, bp, tx);
 }
 
 /*
@@ -219,7 +256,7 @@ dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
 
        dle = kmem_alloc(sizeof (*dle), KM_PUSHPAGE);
        dle->dle_mintxg = mintxg;
-       obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+       obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
        VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
        avl_add(&dl->dl_tree, dle);
 
@@ -245,8 +282,7 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
        dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
        dle_prev = AVL_PREV(&dl->dl_tree, dle);
 
-       bpobj_enqueue_subobj(&dle_prev->dle_bpobj,
-           dle->dle_bpobj.bpo_object, tx);
+       dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);
 
        avl_remove(&dl->dl_tree, dle);
        bpobj_close(&dle->dle_bpobj);
@@ -304,7 +340,7 @@ dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
                if (dle->dle_mintxg >= maxtxg)
                        break;
 
-               obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+               obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
                VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
                    dle->dle_mintxg, obj, tx));
        }
@@ -402,7 +438,7 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
        dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
        if (dle == NULL)
                dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
-       bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
+       dle_enqueue_subobj(dl, dle, obj, tx);
 }
 
 static int
index a4d4e42..294932c 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
@@ -171,10 +171,8 @@ dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
                VERIFY(nvpair_value_nvlist(whopair, &perms) == 0);
 
                if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) {
-                       jumpobj = zap_create(mos, DMU_OT_DSL_PERMS,
-                           DMU_OT_NONE, 0, tx);
-                       VERIFY(zap_update(mos, zapobj,
-                           whokey, 8, 1, &jumpobj, tx) == 0);
+                       jumpobj = zap_create_link(mos, DMU_OT_DSL_PERMS,
+                           zapobj, whokey, tx);
                }
 
                while ((permpair = nvlist_next_nvpair(perms, permpair))) {
index 377df40..7412239 100644 (file)
@@ -189,7 +189,6 @@ errout:
        kmem_free(dd, sizeof (dsl_dir_t));
        dmu_buf_rele(dbuf, tag);
        return (err);
-
 }
 
 void
@@ -223,7 +222,7 @@ dsl_dir_name(dsl_dir_t *dd, char *buf)
        }
 }
 
-/* Calculate name legnth, avoiding all the strcat calls of dsl_dir_name */
+/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
 int
 dsl_dir_namelen(dsl_dir_t *dd)
 {
@@ -592,8 +591,6 @@ dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
 {
        ASSERT(dmu_tx_is_syncing(tx));
 
-       dmu_buf_will_dirty(dd->dd_dbuf, tx);
-
        mutex_enter(&dd->dd_lock);
        ASSERT3U(dd->dd_tempreserved[tx->tx_txg&TXG_MASK], ==, 0);
        dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
@@ -950,8 +947,6 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
        ASSERT(dmu_tx_is_syncing(tx));
        ASSERT(type < DD_USED_NUM);
 
-       dsl_dir_dirty(dd, tx);
-
        if (needlock)
                mutex_enter(&dd->dd_lock);
        accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used);
@@ -960,6 +955,7 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
            dd->dd_phys->dd_compressed_bytes >= -compressed);
        ASSERT(uncompressed >= 0 ||
            dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
+       dmu_buf_will_dirty(dd->dd_dbuf, tx);
        dd->dd_phys->dd_used_bytes += used;
        dd->dd_phys->dd_uncompressed_bytes += uncompressed;
        dd->dd_phys->dd_compressed_bytes += compressed;
@@ -1003,13 +999,13 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
        if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN))
                return;
 
-       dsl_dir_dirty(dd, tx);
        if (needlock)
                mutex_enter(&dd->dd_lock);
        ASSERT(delta > 0 ?
            dd->dd_phys->dd_used_breakdown[oldtype] >= delta :
            dd->dd_phys->dd_used_breakdown[newtype] >= -delta);
        ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta));
+       dmu_buf_will_dirty(dd->dd_dbuf, tx);
        dd->dd_phys->dd_used_breakdown[oldtype] -= delta;
        dd->dd_phys->dd_used_breakdown[newtype] += delta;
        if (needlock)
index 7e0fba5..704f034 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/dsl_pool.h>
@@ -40,6 +40,9 @@
 #include <sys/zfs_znode.h>
 #include <sys/spa_impl.h>
 #include <sys/dsl_deadlist.h>
+#include <sys/bptree.h>
+#include <sys/zfeature.h>
+#include <sys/zil_impl.h>
 
 int zfs_no_write_throttle = 0;
 int zfs_write_limit_shift = 3;                 /* 1/8th of physical memory */
@@ -222,12 +225,12 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 
        txg_list_create(&dp->dp_dirty_datasets,
            offsetof(dsl_dataset_t, ds_dirty_link));
+       txg_list_create(&dp->dp_dirty_zilogs,
+           offsetof(zilog_t, zl_dirty_link));
        txg_list_create(&dp->dp_dirty_dirs,
            offsetof(dsl_dir_t, dd_dirty_link));
        txg_list_create(&dp->dp_sync_tasks,
            offsetof(dsl_sync_task_group_t, dstg_node));
-       list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t),
-           offsetof(dsl_dataset_t, ds_synced_link));
 
        mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
 
@@ -240,20 +243,30 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 }
 
+/*
+ * Create the in-core pool structure with dsl_pool_open_impl() and open the
+ * meta-objset from dp_meta_rootbp into dp_meta_objset.
+ *
+ * On success the new pool is stored in *dpp.  On failure the pool is torn
+ * down with dsl_pool_close() and *dpp is left untouched.
+ *
+ * Returns the result of dmu_objset_open_impl() (0 on success).
+ */
 int
-dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
+dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 {
        int err;
        dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+
+       err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
+           &dp->dp_meta_objset);
+       if (err != 0)
+               dsl_pool_close(dp);
+       else
+               *dpp = dp;
+
+       return (err);
+}
+
+int
+dsl_pool_open(dsl_pool_t *dp)
+{
+       int err;
        dsl_dir_t *dd;
        dsl_dataset_t *ds;
        uint64_t obj;
 
        rw_enter(&dp->dp_config_rwlock, RW_WRITER);
-       err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
-           &dp->dp_meta_objset);
-       if (err)
-               goto out;
-
        err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
            DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
            &dp->dp_root_dir_obj);
@@ -269,7 +282,7 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
        if (err)
                goto out;
 
-       if (spa_version(spa) >= SPA_VERSION_ORIGIN) {
+       if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
                err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
                if (err)
                        goto out;
@@ -286,7 +299,7 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
                        goto out;
        }
 
-       if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+       if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
                err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
                    &dp->dp_free_dir);
                if (err)
@@ -300,6 +313,24 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
                    dp->dp_meta_objset, obj));
        }
 
+       if (spa_feature_is_active(dp->dp_spa,
+           &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+               err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+                   DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
+                   &dp->dp_bptree_obj);
+               if (err != 0)
+                       goto out;
+       }
+
+       if (spa_feature_is_active(dp->dp_spa,
+           &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ])) {
+               err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+                   DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
+                   &dp->dp_empty_bpobj);
+               if (err != 0)
+                       goto out;
+       }
+
        err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
            DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
            &dp->dp_tmp_userrefs_obj);
@@ -308,15 +339,10 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
        if (err)
                goto out;
 
-       err = dsl_scan_init(dp, txg);
+       err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
 
 out:
        rw_exit(&dp->dp_config_rwlock);
-       if (err)
-               dsl_pool_close(dp);
-       else
-               *dpp = dp;
-
        return (err);
 }
 
@@ -346,9 +372,9 @@ dsl_pool_close(dsl_pool_t *dp)
                dmu_objset_evict(dp->dp_meta_objset);
 
        txg_list_destroy(&dp->dp_dirty_datasets);
+       txg_list_destroy(&dp->dp_dirty_zilogs);
        txg_list_destroy(&dp->dp_sync_tasks);
        txg_list_destroy(&dp->dp_dirty_dirs);
-       list_destroy(&dp->dp_synced_datasets);
 
        arc_flush(dp->dp_spa);
        txg_fini(dp);
@@ -429,6 +455,21 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
        return (dp);
 }
 
+/*
+ * Account for the meta-objset space in its placeholder dsl_dir.
+ *
+ * The deltas are not applied to the dsl_dir here; they are only added to
+ * dp_mos_used_delta, dp_mos_compressed_delta and
+ * dp_mos_uncompressed_delta while holding dp_lock.  dsl_pool_sync() later
+ * passes the accumulated values to dsl_dir_diduse_space() for dp_mos_dir
+ * and resets them to zero.
+ *
+ * used, comp, uncomp: signed changes to add to the pending deltas; comp
+ * is asserted to equal uncomp.
+ */
+void
+dsl_pool_mos_diduse_space(dsl_pool_t *dp,
+    int64_t used, int64_t comp, int64_t uncomp)
+{
+       ASSERT3U(comp, ==, uncomp); /* it's all metadata */
+       mutex_enter(&dp->dp_lock);
+       dp->dp_mos_used_delta += used;
+       dp->dp_mos_compressed_delta += comp;
+       dp->dp_mos_uncompressed_delta += uncomp;
+       mutex_exit(&dp->dp_lock);
+}
+
 static int
 deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
@@ -447,11 +488,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
        dmu_tx_t *tx;
        dsl_dir_t *dd;
        dsl_dataset_t *ds;
-       dsl_sync_task_group_t *dstg;
        objset_t *mos = dp->dp_meta_objset;
        hrtime_t start, write_time;
        uint64_t data_written;
        int err;
+       list_t synced_datasets;
+
+       list_create(&synced_datasets, sizeof (dsl_dataset_t),
+           offsetof(dsl_dataset_t, ds_synced_link));
 
        /*
         * We need to copy dp_space_towrite() before doing
@@ -474,7 +518,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
                 * may sync newly-created datasets on pass 2.
                 */
                ASSERT(!list_link_active(&ds->ds_synced_link));
-               list_insert_tail(&dp->dp_synced_datasets, ds);
+               list_insert_tail(&synced_datasets, ds);
                dsl_dataset_sync(ds, zio, tx);
        }
        DTRACE_PROBE(pool_sync__1setup);
@@ -484,15 +528,20 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
        ASSERT(err == 0);
        DTRACE_PROBE(pool_sync__2rootzio);
 
-       for (ds = list_head(&dp->dp_synced_datasets); ds;
-           ds = list_next(&dp->dp_synced_datasets, ds))
+       /*
+        * After the data blocks have been written (ensured by the zio_wait()
+        * above), update the user/group space accounting.
+        */
+       for (ds = list_head(&synced_datasets); ds;
+           ds = list_next(&synced_datasets, ds))
                dmu_objset_do_userquota_updates(ds->ds_objset, tx);
 
        /*
         * Sync the datasets again to push out the changes due to
         * userspace updates.  This must be done before we process the
-        * sync tasks, because that could cause a snapshot of a dataset
-        * whose ds_bp will be rewritten when we do this 2nd sync.
+        * sync tasks, so that any snapshots will have the correct
+        * user accounting information (and we won't get confused
+        * about which blocks are part of the snapshot).
         */
        zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
        while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg))) {
@@ -503,30 +552,42 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
        err = zio_wait(zio);
 
        /*
-        * Move dead blocks from the pending deadlist to the on-disk
-        * deadlist.
+        * Now that the datasets have been completely synced, we can
+        * clean up our in-memory structures accumulated while syncing:
+        *
+        *  - move dead blocks from the pending deadlist to the on-disk deadlist
+        *  - clean up zil records
+        *  - release hold from dsl_dataset_dirty()
         */
-       for (ds = list_head(&dp->dp_synced_datasets); ds;
-           ds = list_next(&dp->dp_synced_datasets, ds)) {
+       while ((ds = list_remove_head(&synced_datasets))) {
+               ASSERTV(objset_t *os = ds->ds_objset);
                bplist_iterate(&ds->ds_pending_deadlist,
                    deadlist_enqueue_cb, &ds->ds_deadlist, tx);
+               ASSERT(!dmu_objset_is_dirty(os, txg));
+               dmu_buf_rele(ds->ds_dbuf, ds);
        }
 
-       while ((dstg = txg_list_remove(&dp->dp_sync_tasks, txg))) {
-               /*
-                * No more sync tasks should have been added while we
-                * were syncing.
-                */
-               ASSERT(spa_sync_pass(dp->dp_spa) == 1);
-               dsl_sync_task_group_sync(dstg, tx);
-       }
-       DTRACE_PROBE(pool_sync__3task);
-
        start = gethrtime();
        while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)))
                dsl_dir_sync(dd, tx);
        write_time += gethrtime() - start;
 
+       /*
+        * The MOS's space is accounted for in the pool/$MOS
+        * (dp_mos_dir).  We can't modify the mos while we're syncing
+        * it, so we remember the deltas and apply them here.
+        */
+       if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
+           dp->dp_mos_uncompressed_delta != 0) {
+               dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
+                   dp->dp_mos_used_delta,
+                   dp->dp_mos_compressed_delta,
+                   dp->dp_mos_uncompressed_delta, tx);
+               dp->dp_mos_used_delta = 0;
+               dp->dp_mos_compressed_delta = 0;
+               dp->dp_mos_uncompressed_delta = 0;
+       }
+
        start = gethrtime();
        if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
            list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
@@ -542,6 +603,27 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
            hrtime_t, dp->dp_read_overhead);
        write_time -= dp->dp_read_overhead;
 
+       /*
+        * If we modify a dataset in the same txg that we want to destroy it,
+        * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
+        * dsl_dir_destroy_check() will fail if there are unexpected holds.
+        * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
+        * and clearing the hold on it) before we process the sync_tasks.
+        * The MOS data dirtied by the sync_tasks will be synced on the next
+        * pass.
+        */
+       DTRACE_PROBE(pool_sync__3task);
+       if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
+               dsl_sync_task_group_t *dstg;
+               /*
+                * No more sync tasks should have been added while we
+                * were syncing.
+                */
+               ASSERT(spa_sync_pass(dp->dp_spa) == 1);
+               while ((dstg = txg_list_remove(&dp->dp_sync_tasks, txg)))
+                       dsl_sync_task_group_sync(dstg, tx);
+       }
+
        dmu_tx_commit(tx);
 
        dp->dp_space_towrite[txg & TXG_MASK] = 0;
@@ -590,15 +672,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 void
 dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 {
+       zilog_t *zilog;
        dsl_dataset_t *ds;
-       objset_t *os;
 
-       while ((ds = list_head(&dp->dp_synced_datasets))) {
-               list_remove(&dp->dp_synced_datasets, ds);
-               os = ds->ds_objset;
-               zil_clean(os->os_zil, txg);
-               ASSERT(!dmu_objset_is_dirty(os, txg));
-               dmu_buf_rele(ds->ds_dbuf, ds);
+       while ((zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg))) {
+               ds = dmu_objset_ds(zilog->zl_os);
+               zil_clean(zilog, txg);
+               ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
+               dmu_buf_rele(ds->ds_dbuf, zilog);
        }
        ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 }
@@ -611,7 +692,7 @@ int
 dsl_pool_sync_context(dsl_pool_t *dp)
 {
        return (curthread == dp->dp_tx.tx_sync_thread ||
-           spa_get_dsl(dp->dp_spa) == NULL);
+           spa_is_initializing(dp->dp_spa));
 }
 
 uint64_t
@@ -932,11 +1013,8 @@ dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
        ASSERT(dp->dp_tmp_userrefs_obj == 0);
        ASSERT(dmu_tx_is_syncing(tx));
 
-       dp->dp_tmp_userrefs_obj = zap_create(mos, DMU_OT_USERREFS,
-           DMU_OT_NONE, 0, tx);
-
-       VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS,
-           sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj, tx) == 0);
+       dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
+           DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
 }
 
 static int
index c2386dd..297caa0 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/dsl_scan.h>
@@ -44,6 +45,7 @@
 #include <sys/ddt.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
+#include <sys/zfeature.h>
 #ifdef _KERNEL
 #include <sys/zfs_vfsops.h>
 #endif
@@ -379,55 +381,6 @@ dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
            priority, zio_flags, arc_flags, zb));
 }
 
-static boolean_t
-bookmark_is_zero(const zbookmark_t *zb)
-{
-       return (zb->zb_objset == 0 && zb->zb_object == 0 &&
-           zb->zb_level == 0 && zb->zb_blkid == 0);
-}
-
-/* dnp is the dnode for zb1->zb_object */
-static boolean_t
-bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
-    const zbookmark_t *zb2)
-{
-       uint64_t zb1nextL0, zb2thisobj;
-
-       ASSERT(zb1->zb_objset == zb2->zb_objset);
-       ASSERT(zb2->zb_level == 0);
-
-       /*
-        * A bookmark in the deadlist is considered to be after
-        * everything else.
-        */
-       if (zb2->zb_object == DMU_DEADLIST_OBJECT)
-               return (B_TRUE);
-
-       /* The objset_phys_t isn't before anything. */
-       if (dnp == NULL)
-               return (B_FALSE);
-
-       zb1nextL0 = (zb1->zb_blkid + 1) <<
-           ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
-
-       zb2thisobj = zb2->zb_object ? zb2->zb_object :
-           zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
-
-       if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
-               uint64_t nextobj = zb1nextL0 *
-                   (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
-               return (nextobj <= zb2thisobj);
-       }
-
-       if (zb1->zb_object < zb2thisobj)
-               return (B_TRUE);
-       if (zb1->zb_object > zb2thisobj)
-               return (B_FALSE);
-       if (zb2->zb_object == DMU_META_DNODE_OBJECT)
-               return (B_FALSE);
-       return (zb1nextL0 <= zb2->zb_blkid);
-}
-
 static uint64_t
 dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
 {
@@ -459,7 +412,7 @@ dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
        if (scn->scn_pausing)
                return (B_TRUE); /* we're already pausing */
 
-       if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark))
+       if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
                return (B_FALSE); /* we're resuming */
 
        /* We only know how to resume from level-0 blocks. */
@@ -614,13 +567,13 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
        /*
         * We never skip over user/group accounting objects (obj<0)
         */
-       if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) &&
+       if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
            (int64_t)zb->zb_object >= 0) {
                /*
                 * If we already visited this bp & everything below (in
                 * a prior txg sync), don't bother doing it again.
                 */
-               if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
+               if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
                        return (B_TRUE);
 
                /*
@@ -823,22 +776,6 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
        if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
                goto out;
 
-       if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) {
-               /*
-                * For non-user-accounting blocks, we need to read the
-                * new bp (from a deleted snapshot, found in
-                * check_existing_xlation).  If we used the old bp,
-                * pointers inside this block from before we resumed
-                * would be untranslated.
-                *
-                * For user-accounting blocks, we need to read the old
-                * bp, because we will apply the entire space delta to
-                * it (original untranslated -> translations from
-                * deleted snap -> now).
-                */
-               *bp_toread = *bp;
-       }
-
        if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx,
            &buf) != 0)
                goto out;
@@ -1414,19 +1351,28 @@ out:
        kmem_free(zc, sizeof(zap_cursor_t));
 }
 
-static int
-dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+static boolean_t
+dsl_scan_free_should_pause(dsl_scan_t *scn)
 {
-       dsl_scan_t *scn = arg;
        uint64_t elapsed_nanosecs;
 
        elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
-
-       if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+       return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
            (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
            txg_sync_waiting(scn->scn_dp)) ||
-           spa_shutting_down(scn->scn_dp->dp_spa))
-               return (ERESTART);
+           spa_shutting_down(scn->scn_dp->dp_spa));
+}
+
+static int
+dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+       dsl_scan_t *scn = arg;
+
+       if (!scn->scn_is_bptree ||
+           (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
+               if (dsl_scan_free_should_pause(scn))
+                       return (ERESTART);
+       }
 
        zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
            dmu_tx_get_txg(tx), bp, 0));
@@ -1451,6 +1397,10 @@ dsl_scan_active(dsl_scan_t *scn)
        if (scn->scn_phys.scn_state == DSS_SCANNING)
                return (B_TRUE);
 
+       if (spa_feature_is_active(spa,
+           &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+               return (B_TRUE);
+       }
        if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
                (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
                    &used, &comp, &uncomp);
@@ -1497,14 +1447,40 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
         * traversing it.
         */
        if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+               scn->scn_is_bptree = B_FALSE;
                scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
                    NULL, ZIO_FLAG_MUSTSUCCEED);
                err = bpobj_iterate(&dp->dp_free_bpobj,
-                   dsl_scan_free_cb, scn, tx);
+                   dsl_scan_free_block_cb, scn, tx);
                VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+
+               if (err == 0 && spa_feature_is_active(spa,
+                   &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+                       scn->scn_is_bptree = B_TRUE;
+                       scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+                           NULL, ZIO_FLAG_MUSTSUCCEED);
+                       err = bptree_iterate(dp->dp_meta_objset,
+                           dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
+                           scn, tx);
+                       VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+                       if (err != 0)
+                               return;
+
+                       /* disable async destroy feature */
+                       spa_feature_decr(spa,
+                           &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx);
+                       ASSERT(!spa_feature_is_active(spa,
+                           &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]));
+                       VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
+                           DMU_POOL_DIRECTORY_OBJECT,
+                           DMU_POOL_BPTREE_OBJ, tx));
+                       VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset,
+                           dp->dp_bptree_obj, tx));
+                       dp->dp_bptree_obj = 0;
+               }
                if (scn->scn_visited_this_txg) {
                        zfs_dbgmsg("freed %llu blocks in %llums from "
-                           "free_bpobj txg %llu",
+                           "free_bpobj/bptree txg %llu",
                            (longlong_t)scn->scn_visited_this_txg,
                            (longlong_t)
                            (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
@@ -1619,9 +1595,13 @@ count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
        for (i = 0; i < 4; i++) {
                int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
                int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
-               zfs_blkstat_t *zb = &zab->zab_type[l][t];
                int equal;
+               zfs_blkstat_t *zb;
+
+               if (t & DMU_OT_NEWTYPE)
+                       t = DMU_OT_OTHER;
 
+               zb = &zab->zab_type[l][t];
                zb->zb_count++;
                zb->zb_asize += BP_GET_ASIZE(bp);
                zb->zb_lsize += BP_GET_LSIZE(bp);
index d4b28cc..240a683 100644 (file)
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -446,10 +448,9 @@ sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count,
                char attr_name[8];
 
                if (sa->sa_layout_attr_obj == 0) {
-                       sa->sa_layout_attr_obj = zap_create(os,
-                           DMU_OT_SA_ATTR_LAYOUTS, DMU_OT_NONE, 0, tx);
-                       VERIFY(zap_add(os, sa->sa_master_obj, SA_LAYOUTS, 8, 1,
-                           &sa->sa_layout_attr_obj, tx) == 0);
+                       sa->sa_layout_attr_obj = zap_create_link(os,
+                           DMU_OT_SA_ATTR_LAYOUTS,
+                           sa->sa_master_obj, SA_LAYOUTS, tx);
                }
 
                (void) snprintf(attr_name, sizeof (attr_name),
@@ -1583,10 +1584,9 @@ sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx)
        }
 
        if (sa->sa_reg_attr_obj == 0) {
-               sa->sa_reg_attr_obj = zap_create(hdl->sa_os,
-                   DMU_OT_SA_ATTR_REGISTRATION, DMU_OT_NONE, 0, tx);
-               VERIFY(zap_add(hdl->sa_os, sa->sa_master_obj,
-                   SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj, tx) == 0);
+               sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os,
+                   DMU_OT_SA_ATTR_REGISTRATION,
+                   sa->sa_master_obj, SA_REGISTRY, tx);
        }
        for (i = 0; i != sa->sa_num_attrs; i++) {
                if (sa->sa_attr_table[i].sa_registered)
index b610a0d..5b6465f 100644 (file)
@@ -63,6 +63,7 @@
 #include <sys/spa_boot.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/dsl_scan.h>
+#include <sys/zfeature.h>
 
 #ifdef _KERNEL
 #include <sys/bootprops.h>
@@ -114,7 +115,10 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
        { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
 };
 
+static dsl_syncfunc_t spa_sync_version;
 static dsl_syncfunc_t spa_sync_props;
+static dsl_checkfunc_t spa_change_guid_check;
+static dsl_syncfunc_t spa_change_guid_sync;
 static boolean_t spa_has_active_shared_spare(spa_t *spa);
 static inline int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
@@ -169,6 +173,7 @@ static void
 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 {
        vdev_t *rvd = spa->spa_root_vdev;
+       dsl_pool_t *pool = spa->spa_dsl_pool;
        uint64_t size;
        uint64_t alloc;
        uint64_t space;
@@ -216,6 +221,22 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
                spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
        }
 
+       if (pool != NULL) {
+               dsl_dir_t *freedir = pool->dp_free_dir;
+
+               /*
+                * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
+                * for pools older than that version, freedir will be NULL.
+                */
+               if (freedir != NULL) {
+                       spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
+                           freedir->dd_phys->dd_used_bytes, src);
+               } else {
+                       spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
+                           NULL, 0, src);
+               }
+       }
+
        spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
 
        if (spa->spa_comment != NULL) {
@@ -357,25 +378,55 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
        nvpair_t *elem;
        int error = 0, reset_bootfs = 0;
        uint64_t objnum = 0;
+       boolean_t has_feature = B_FALSE;
 
        elem = NULL;
        while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
-               zpool_prop_t prop;
-               char *propname, *strval;
                uint64_t intval;
-               objset_t *os;
-               char *slash, *check;
+               char *strval, *slash, *check, *fname;
+               const char *propname = nvpair_name(elem);
+               zpool_prop_t prop = zpool_name_to_prop(propname);
+
+               switch ((int)prop) {
+               case ZPROP_INVAL:
+                       if (!zpool_prop_feature(propname)) {
+                               error = EINVAL;
+                               break;
+                       }
+
+                       /*
+                        * Sanitize the input.
+                        */
+                       if (nvpair_type(elem) != DATA_TYPE_UINT64) {
+                               error = EINVAL;
+                               break;
+                       }
+
+                       if (nvpair_value_uint64(elem, &intval) != 0) {
+                               error = EINVAL;
+                               break;
+                       }
 
-               propname = nvpair_name(elem);
+                       if (intval != 0) {
+                               error = EINVAL;
+                               break;
+                       }
+
+                       fname = strchr(propname, '@') + 1;
+                       if (zfeature_lookup_name(fname, NULL) != 0) {
+                               error = EINVAL;
+                               break;
+                       }
 
-               if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
-                       return (EINVAL);
+                       has_feature = B_TRUE;
+                       break;
 
-               switch (prop) {
                case ZPOOL_PROP_VERSION:
                        error = nvpair_value_uint64(elem, &intval);
                        if (!error &&
-                           (intval < spa_version(spa) || intval > SPA_VERSION))
+                           (intval < spa_version(spa) ||
+                           intval > SPA_VERSION_BEFORE_FEATURES ||
+                           has_feature))
                                error = EINVAL;
                        break;
 
@@ -412,6 +463,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
                        error = nvpair_value_string(elem, &strval);
 
                        if (!error) {
+                               objset_t *os;
                                uint64_t compress;
 
                                if (strval == NULL || strval[0] == '\0') {
@@ -558,33 +610,58 @@ int
 spa_prop_set(spa_t *spa, nvlist_t *nvp)
 {
        int error;
-       nvpair_t *elem;
+       nvpair_t *elem = NULL;
        boolean_t need_sync = B_FALSE;
-       zpool_prop_t prop;
 
        if ((error = spa_prop_validate(spa, nvp)) != 0)
                return (error);
 
-       elem = NULL;
        while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
-               if ((prop = zpool_name_to_prop(
-                   nvpair_name(elem))) == ZPROP_INVAL)
-                       return (EINVAL);
+               zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
 
                if (prop == ZPOOL_PROP_CACHEFILE ||
                    prop == ZPOOL_PROP_ALTROOT ||
                    prop == ZPOOL_PROP_READONLY)
                        continue;
 
+               if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
+                       uint64_t ver;
+
+                       if (prop == ZPOOL_PROP_VERSION) {
+                               VERIFY(nvpair_value_uint64(elem, &ver) == 0);
+                       } else {
+                               ASSERT(zpool_prop_feature(nvpair_name(elem)));
+                               ver = SPA_VERSION_FEATURES;
+                               need_sync = B_TRUE;
+                       }
+
+                       /* Save time if the version is already set. */
+                       if (ver == spa_version(spa))
+                               continue;
+
+                       /*
+                        * In addition to the pool directory object, we might
+                        * create the pool properties object, the features for
+                        * read object, the features for write object, or the
+                        * feature descriptions object.
+                        */
+                       error = dsl_sync_task_do(spa_get_dsl(spa), NULL,
+                           spa_sync_version, spa, &ver, 6);
+                       if (error)
+                               return (error);
+                       continue;
+               }
+
                need_sync = B_TRUE;
                break;
        }
 
-       if (need_sync)
+       if (need_sync) {
                return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
-                   spa, nvp, 3));
-       else
-               return (0);
+                   spa, nvp, 6));
+       }
+
+       return (0);
 }
 
 /*
@@ -601,6 +678,47 @@ spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
        }
 }
 
+/*ARGSUSED*/
+static int
+spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+       spa_t *spa = arg1;
+       vdev_t *rvd = spa->spa_root_vdev;
+       uint64_t vdev_state;
+       ASSERTV(uint64_t *newguid = arg2);
+
+       spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+       vdev_state = rvd->vdev_state;
+       spa_config_exit(spa, SCL_STATE, FTAG);
+
+       if (vdev_state != VDEV_STATE_HEALTHY)
+               return (ENXIO);
+
+       ASSERT3U(spa_guid(spa), !=, *newguid);
+
+       return (0);
+}
+
+static void
+spa_change_guid_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+       spa_t *spa = arg1;
+       uint64_t *newguid = arg2;
+       uint64_t oldguid;
+       vdev_t *rvd = spa->spa_root_vdev;
+
+       oldguid = spa_guid(spa);
+
+       spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+       rvd->vdev_guid = *newguid;
+       rvd->vdev_guid_sum += (*newguid - oldguid);
+       vdev_config_dirty(rvd);
+       spa_config_exit(spa, SCL_STATE, FTAG);
+
+       spa_history_log_internal(LOG_POOL_GUID_CHANGE, spa, tx,
+           "old=%lld new=%lld", oldguid, *newguid);
+}
+
 /*
  * Change the GUID for the pool.  This is done so that we can later
  * re-import a pool built from a clone of our own vdevs.  We will modify
@@ -613,29 +731,23 @@ spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
 int
 spa_change_guid(spa_t *spa)
 {
-       uint64_t        oldguid, newguid;
-       uint64_t        txg;
-
-       if (!(spa_mode_global & FWRITE))
-               return (EROFS);
-
-       txg = spa_vdev_enter(spa);
-
-       if (spa->spa_root_vdev->vdev_state != VDEV_STATE_HEALTHY)
-               return (spa_vdev_exit(spa, NULL, txg, ENXIO));
+       int error;
+       uint64_t guid;
 
-       oldguid = spa_guid(spa);
-       newguid = spa_generate_guid(NULL);
-       ASSERT3U(oldguid, !=, newguid);
+       mutex_enter(&spa_namespace_lock);
+       guid = spa_generate_guid(NULL);
 
-       spa->spa_root_vdev->vdev_guid = newguid;
-       spa->spa_root_vdev->vdev_guid_sum += (newguid - oldguid);
+       error = dsl_sync_task_do(spa_get_dsl(spa), spa_change_guid_check,
+           spa_change_guid_sync, spa, &guid, 5);
 
-       vdev_config_dirty(spa->spa_root_vdev);
+       if (error == 0) {
+               spa_config_sync(spa, B_FALSE, B_TRUE);
+               spa_event_notify(spa, NULL, FM_EREPORT_ZFS_POOL_REGUID);
+       }
 
-       spa_event_notify(spa, NULL, FM_EREPORT_ZFS_POOL_REGUID);
+       mutex_exit(&spa_namespace_lock);
 
-       return (spa_vdev_exit(spa, NULL, txg, 0));
+       return (error);
 }
 
 /*
@@ -1628,7 +1740,7 @@ spa_load_verify_done(zio_t *zio)
        int error = zio->io_error;
 
        if (error) {
-               if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) &&
+               if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
                    type != DMU_OT_INTENT_LOG)
                        atomic_add_64(&sle->sle_meta_count, 1);
                else
@@ -1858,6 +1970,9 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
                            KM_PUSHPAGE) == 0);
                }
 
+               nvlist_free(spa->spa_load_info);
+               spa->spa_load_info = fnvlist_alloc();
+
                gethrestime(&spa->spa_loaded_ts);
                error = spa_load_impl(spa, pool_guid, config, state, type,
                    mosconfig, &ereport);
@@ -1891,12 +2006,14 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 {
        int error = 0;
        nvlist_t *nvroot = NULL;
+       nvlist_t *label;
        vdev_t *rvd;
        uberblock_t *ub = &spa->spa_uberblock;
        uint64_t children, config_cache_txg = spa->spa_config_txg;
        int orig_mode = spa->spa_mode;
        int parse;
        uint64_t obj;
+       boolean_t missing_feat_write = B_FALSE;
 
        /*
         * If this is an untrusted config, access the pool in read-only mode.
@@ -1976,19 +2093,79 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
        /*
         * Find the best uberblock.
         */
-       vdev_uberblock_load(NULL, rvd, ub);
+       vdev_uberblock_load(rvd, ub, &label);
 
        /*
         * If we weren't able to find a single valid uberblock, return failure.
         */
-       if (ub->ub_txg == 0)
+       if (ub->ub_txg == 0) {
+               nvlist_free(label);
                return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
+       }
 
        /*
-        * If the pool is newer than the code, we can't open it.
+        * If the pool has an unsupported version we can't open it.
         */
-       if (ub->ub_version > SPA_VERSION)
+       if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
+               nvlist_free(label);
                return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
+       }
+
+       if (ub->ub_version >= SPA_VERSION_FEATURES) {
+               nvlist_t *features;
+
+               /*
+                * If we weren't able to find what's necessary for reading the
+                * MOS in the label, return failure.
+                */
+               if (label == NULL || nvlist_lookup_nvlist(label,
+                   ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
+                       nvlist_free(label);
+                       return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+                           ENXIO));
+               }
+
+               /*
+                * Update our in-core representation with the definitive values
+                * from the label.
+                */
+               nvlist_free(spa->spa_label_features);
+               VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
+       }
+
+       nvlist_free(label);
+
+       /*
+        * Look through entries in the label nvlist's features_for_read. If
+        * there is a feature listed there which we don't understand then we
+        * cannot open a pool.
+        */
+       if (ub->ub_version >= SPA_VERSION_FEATURES) {
+               nvlist_t *unsup_feat;
+               nvpair_t *nvp;
+
+               VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
+                   0);
+
+               for (nvp = nvlist_next_nvpair(spa->spa_label_features, NULL);
+                   nvp != NULL;
+                   nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
+                       if (!zfeature_is_supported(nvpair_name(nvp))) {
+                               VERIFY(nvlist_add_string(unsup_feat,
+                                   nvpair_name(nvp), "") == 0);
+                       }
+               }
+
+               if (!nvlist_empty(unsup_feat)) {
+                       VERIFY(nvlist_add_nvlist(spa->spa_load_info,
+                           ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
+                       nvlist_free(unsup_feat);
+                       return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
+                           ENOTSUP));
+               }
+
+               nvlist_free(unsup_feat);
+       }
 
        /*
         * If the vdev guid sum doesn't match the uberblock, we have an
@@ -2022,7 +2199,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
        spa->spa_claim_max_txg = spa->spa_first_txg;
        spa->spa_prev_software_version = ub->ub_software_version;
 
-       error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
+       error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
        if (error)
                return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
        spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
@@ -2030,6 +2207,89 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
        if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
                return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
+       if (spa_version(spa) >= SPA_VERSION_FEATURES) {
+               boolean_t missing_feat_read = B_FALSE;
+               nvlist_t *unsup_feat, *enabled_feat;
+
+               if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
+                   &spa->spa_feat_for_read_obj) != 0) {
+                       return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+               }
+
+               if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
+                   &spa->spa_feat_for_write_obj) != 0) {
+                       return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+               }
+
+               if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
+                   &spa->spa_feat_desc_obj) != 0) {
+                       return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+               }
+
+               enabled_feat = fnvlist_alloc();
+               unsup_feat = fnvlist_alloc();
+
+               if (!feature_is_supported(spa->spa_meta_objset,
+                   spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj,
+                   unsup_feat, enabled_feat))
+                       missing_feat_read = B_TRUE;
+
+               if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
+                       if (!feature_is_supported(spa->spa_meta_objset,
+                           spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj,
+                           unsup_feat, enabled_feat)) {
+                               missing_feat_write = B_TRUE;
+                       }
+               }
+
+               fnvlist_add_nvlist(spa->spa_load_info,
+                   ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
+
+               if (!nvlist_empty(unsup_feat)) {
+                       fnvlist_add_nvlist(spa->spa_load_info,
+                           ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
+               }
+
+               fnvlist_free(enabled_feat);
+               fnvlist_free(unsup_feat);
+
+               if (!missing_feat_read) {
+                       fnvlist_add_boolean(spa->spa_load_info,
+                           ZPOOL_CONFIG_CAN_RDONLY);
+               }
+
+               /*
+                * If the state is SPA_LOAD_TRYIMPORT, our objective is
+                * twofold: to determine whether the pool is available for
+                * import in read-write mode and (if it is not) whether the
+                * pool is available for import in read-only mode. If the pool
+                * is available for import in read-write mode, it is displayed
+                * as available in userland; if it is not available for import
+                * in read-only mode, it is displayed as unavailable in
+                * userland. If the pool is available for import in read-only
+                * mode but not read-write mode, it is displayed as unavailable
+                * in userland with a special note that the pool is actually
+                * available for open in read-only mode.
+                *
+                * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
+                * missing a feature for write, we must first determine whether
+                * the pool can be opened read-only before returning to
+                * userland in order to know whether to display the
+                * abovementioned note.
+                */
+               if (missing_feat_read || (missing_feat_write &&
+                   spa_writeable(spa))) {
+                       return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
+                           ENOTSUP));
+               }
+       }
+
+       spa->spa_is_initializing = B_TRUE;
+       error = dsl_pool_open(spa->spa_dsl_pool);
+       spa->spa_is_initializing = B_FALSE;
+       if (error != 0)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
        if (!mosconfig) {
                uint64_t hostid;
                nvlist_t *policy = NULL, *nvconfig;
@@ -2247,7 +2507,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
                nvlist_free(nvconfig);
 
                /*
-                * Now that we've validate the config, check the state of the
+                * Now that we've validated the config, check the state of the
                 * root vdev.  If it can't be opened, it indicates one or
                 * more toplevel vdevs are faulted.
                 */
@@ -2260,6 +2520,17 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
                }
        }
 
+       if (missing_feat_write) {
+               ASSERT(state == SPA_LOAD_TRYIMPORT);
+
+               /*
+                * At this point, we know that we can open the pool in
+                * read-only mode but not read-write mode. We now have enough
+                * information and can return to userland.
+                */
+               return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
+       }
+
        /*
         * We've successfully opened the pool, verify that we're ready
         * to start pushing transactions.
@@ -2370,10 +2641,18 @@ spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
        return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
 }
 
+/*
+ * If spa_load() fails this function will try loading prior txg's. If
+ * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
+ * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
+ * function will not rewind the pool and will return the same error as
+ * spa_load().
+ */
 static int
 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
     uint64_t max_request, int rewind_flags)
 {
+       nvlist_t *loadinfo = NULL;
        nvlist_t *config = NULL;
        int load_error, rewind_error;
        uint64_t safe_rewind_txg;
@@ -2402,9 +2681,18 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
                return (load_error);
        }
 
-       /* Price of rolling back is discarding txgs, including log */
-       if (state == SPA_LOAD_RECOVER)
+       if (state == SPA_LOAD_RECOVER) {
+               /* Price of rolling back is discarding txgs, including log */
                spa_set_log_state(spa, SPA_LOG_CLEAR);
+       } else {
+               /*
+                * If we aren't rolling back save the load info from our first
+                * import attempt so that we can restore it after attempting
+                * to rewind.
+                */
+               loadinfo = spa->spa_load_info;
+               spa->spa_load_info = fnvlist_alloc();
+       }
 
        spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
        safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
@@ -2428,7 +2716,20 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
        if (config && (rewind_error || state != SPA_LOAD_RECOVER))
                spa_config_set(spa, config);
 
-       return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
+       if (state == SPA_LOAD_RECOVER) {
+               ASSERT3P(loadinfo, ==, NULL);
+               return (rewind_error);
+       } else {
+               /* Store the rewind info as part of the initial load info */
+               fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
+                   spa->spa_load_info);
+
+               /* Restore the initial load info */
+               fnvlist_free(spa->spa_load_info);
+               spa->spa_load_info = loadinfo;
+
+               return (load_error);
+       }
 }
 
 /*
@@ -2698,8 +2999,50 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config)
        }
 }
 
+static void
+spa_add_feature_stats(spa_t *spa, nvlist_t *config)
+{
+       nvlist_t *features;
+       zap_cursor_t zc;
+       zap_attribute_t za;
+
+       ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+       VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+       if (spa->spa_feat_for_read_obj != 0) {
+               for (zap_cursor_init(&zc, spa->spa_meta_objset,
+                   spa->spa_feat_for_read_obj);
+                   zap_cursor_retrieve(&zc, &za) == 0;
+                   zap_cursor_advance(&zc)) {
+                       ASSERT(za.za_integer_length == sizeof (uint64_t) &&
+                           za.za_num_integers == 1);
+                       VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
+                           za.za_first_integer));
+               }
+               zap_cursor_fini(&zc);
+       }
+
+       if (spa->spa_feat_for_write_obj != 0) {
+               for (zap_cursor_init(&zc, spa->spa_meta_objset,
+                   spa->spa_feat_for_write_obj);
+                   zap_cursor_retrieve(&zc, &za) == 0;
+                   zap_cursor_advance(&zc)) {
+                       ASSERT(za.za_integer_length == sizeof (uint64_t) &&
+                           za.za_num_integers == 1);
+                       VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
+                           za.za_first_integer));
+               }
+               zap_cursor_fini(&zc);
+       }
+
+       VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
+           features) == 0);
+       nvlist_free(features);
+}
+
 int
-spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
+spa_get_stats(const char *name, nvlist_t **config,
+    char *altroot, size_t buflen)
 {
        int error;
        spa_t *spa;
@@ -2734,6 +3077,7 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
 
                        spa_add_spares(spa, *config);
                        spa_add_l2cache(spa, *config);
+                       spa_add_feature_stats(spa, *config);
                }
        }
 
@@ -2954,6 +3298,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
        nvlist_t **spares, **l2cache;
        uint_t nspares, nl2cache;
        uint64_t version, obj;
+       boolean_t has_features;
+       nvpair_t *elem;
        int c;
 
        /*
@@ -2980,10 +3326,18 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
                return (error);
        }
 
-       if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
-           &version) != 0)
+       has_features = B_FALSE;
+       for (elem = nvlist_next_nvpair(props, NULL);
+           elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
+               if (zpool_prop_feature(nvpair_name(elem)))
+                       has_features = B_TRUE;
+       }
+
+       if (has_features || nvlist_lookup_uint64(props,
+           zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
                version = SPA_VERSION;
-       ASSERT(version <= SPA_VERSION);
+       }
+       ASSERT(SPA_VERSION_IS_SUPPORTED(version));
 
        spa->spa_first_txg = txg;
        spa->spa_uberblock.ub_txg = txg - 1;
@@ -3059,8 +3413,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
                spa->spa_l2cache.sav_sync = B_TRUE;
        }
 
+       spa->spa_is_initializing = B_TRUE;
        spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
        spa->spa_meta_objset = dp->dp_meta_objset;
+       spa->spa_is_initializing = B_FALSE;
 
        /*
         * Create DDTs (dedup tables).
@@ -3084,6 +3440,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
                cmn_err(CE_PANIC, "failed to add pool config");
        }
 
+       if (spa_version(spa) >= SPA_VERSION_FEATURES)
+               spa_feature_create_zap_objects(spa, tx);
+
        if (zap_add(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
            sizeof (uint64_t), 1, &version, tx) != 0) {
@@ -3276,7 +3635,7 @@ spa_import_rootpool(char *devpath, char *devid)
        }
 #endif
        if (config == NULL) {
-               cmn_err(CE_NOTE, "Can not read the pool label from '%s'",
+               cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
                    devpath);
                return (EIO);
        }
@@ -3590,6 +3949,8 @@ spa_tryimport(nvlist_t *tryconfig)
                    state) == 0);
                VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
                    spa->spa_uberblock.ub_timestamp) == 0);
+               VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
+                   spa->spa_load_info) == 0);
 
                /*
                 * If the bootfs property exists on this pool then we
@@ -5305,7 +5666,7 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
         * information.  This avoids the dbuf_will_dirty() path and
         * saves us a pre-read to get data we don't actually care about.
         */
-       bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE);
+       bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
        packed = vmem_alloc(bufsize, KM_PUSHPAGE);
 
        VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
@@ -5381,6 +5742,14 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
        config = spa_config_generate(spa, spa->spa_root_vdev,
            dmu_tx_get_txg(tx), B_FALSE);
 
+       /*
+        * If we're upgrading the spa version then make sure that
+        * the config object gets updated with the correct version.
+        */
+       if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
+               fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
+                   spa->spa_uberblock.ub_version);
+
        spa_config_exit(spa, SCL_STATE, FTAG);
 
        if (spa->spa_config_syncing)
@@ -5390,6 +5759,24 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
        spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
 }
 
+static void
+spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+       spa_t *spa = arg1;
+       uint64_t version = *(uint64_t *)arg2;
+
+       /*
+        * Setting the version is special cased when first creating the pool.
+        */
+       ASSERT(tx->tx_txg != TXG_INITIAL);
+
+       ASSERT(version <= SPA_VERSION);
+       ASSERT(version >= spa_version(spa));
+
+       spa->spa_uberblock.ub_version = version;
+       vdev_config_dirty(spa->spa_root_vdev);
+}
+
 /*
  * Set zpool properties.
  */
@@ -5399,32 +5786,39 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
        spa_t *spa = arg1;
        objset_t *mos = spa->spa_meta_objset;
        nvlist_t *nvp = arg2;
-       nvpair_t *elem;
-       uint64_t intval;
-       char *strval;
-       zpool_prop_t prop;
-       const char *propname;
-       zprop_type_t proptype;
+       nvpair_t *elem = NULL;
 
        mutex_enter(&spa->spa_props_lock);
 
-       elem = NULL;
        while ((elem = nvlist_next_nvpair(nvp, elem))) {
-               switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
+               uint64_t intval;
+               char *strval, *fname;
+               zpool_prop_t prop;
+               const char *propname;
+               zprop_type_t proptype;
+               zfeature_info_t *feature;
+
+               prop = zpool_name_to_prop(nvpair_name(elem));
+               switch ((int)prop) {
+               case ZPROP_INVAL:
+                       /*
+                        * We checked this earlier in spa_prop_validate().
+                        */
+                       ASSERT(zpool_prop_feature(nvpair_name(elem)));
+
+                       fname = strchr(nvpair_name(elem), '@') + 1;
+                       VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature));
+
+                       spa_feature_enable(spa, feature, tx);
+                       break;
+
                case ZPOOL_PROP_VERSION:
+                       VERIFY(nvpair_value_uint64(elem, &intval) == 0);
                        /*
-                        * Only set version for non-zpool-creation cases
-                        * (set/import). spa_create() needs special care
-                        * for version setting.
+                        * The version is synced separately before other
+                        * properties and should be correct by now.
                         */
-                       if (tx->tx_txg != TXG_INITIAL) {
-                               VERIFY(nvpair_value_uint64(elem,
-                                   &intval) == 0);
-                               ASSERT(intval <= SPA_VERSION);
-                               ASSERT(intval >= spa_version(spa));
-                               spa->spa_uberblock.ub_version = intval;
-                               vdev_config_dirty(spa->spa_root_vdev);
-                       }
+                       ASSERT3U(spa_version(spa), >=, intval);
                        break;
 
                case ZPOOL_PROP_ALTROOT:
@@ -5461,14 +5855,10 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
                         * Set pool property values in the poolprops mos object.
                         */
                        if (spa->spa_pool_props_object == 0) {
-                               VERIFY((spa->spa_pool_props_object =
-                                   zap_create(mos, DMU_OT_POOL_PROPS,
-                                   DMU_OT_NONE, 0, tx)) > 0);
-
-                               VERIFY(zap_update(mos,
+                               spa->spa_pool_props_object =
+                                   zap_create_link(mos, DMU_OT_POOL_PROPS,
                                    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
-                                   8, 1, &spa->spa_pool_props_object, tx)
-                                   == 0);
+                                   tx);
                        }
 
                        /* normalize the property name */
@@ -5567,6 +5957,11 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
                /* Keeping the freedir open increases spa_minref */
                spa->spa_minref += 3;
        }
+
+       if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
+           spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
+               spa_feature_create_zap_objects(spa, tx);
+       }
 }
 
 /*
@@ -5738,6 +6133,9 @@ spa_sync(spa_t *spa, uint64_t txg)
                                    rvd->vdev_children, txg, B_TRUE);
                }
 
+               if (error == 0)
+                       spa->spa_last_synced_guid = rvd->vdev_guid;
+
                spa_config_exit(spa, SCL_STATE, FTAG);
 
                if (error == 0)
index c868841..09149e6 100644 (file)
@@ -22,7 +22,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/spa.h>
@@ -35,6 +35,7 @@
 #include <sys/utsname.h>
 #include <sys/systeminfo.h>
 #include <sys/sunddi.h>
+#include <sys/zfeature.h>
 #ifdef _KERNEL
 #include <sys/kobj.h>
 #include <sys/zone.h>
@@ -408,6 +409,12 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
        VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
        nvlist_free(nvroot);
 
+       /*
+        * Store what's necessary for reading the MOS in the label.
+        */
+       VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
+           spa->spa_label_features) == 0);
+
        if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) {
                ddt_histogram_t *ddh;
                ddt_stat_t *dds;
index 4a8e6ad..5ec8e68 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  */
 
@@ -49,6 +49,7 @@
 #include <sys/arc.h>
 #include <sys/ddt.h>
 #include "zfs_prop.h"
+#include "zfeature_common.h"
 
 /*
  * SPA locking
  * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
  * locking is, always, based on spa_namespace_lock and spa_config_lock[].
  *
- * spa_rename() is also implemented within this file since is requires
+ * spa_rename() is also implemented within this file since it requires
  * manipulation of the namespace.
  */
 
@@ -479,8 +480,22 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
        VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
            KM_PUSHPAGE) == 0);
 
-       if (config != NULL)
+       if (config != NULL) {
+               nvlist_t *features;
+
+               if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
+                   &features) == 0) {
+                       VERIFY(nvlist_dup(features, &spa->spa_label_features,
+                           0) == 0);
+               }
+
                VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
+       }
+
+       if (spa->spa_label_features == NULL) {
+               VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
+                   KM_SLEEP) == 0);
+       }
 
        return (spa);
 }
@@ -518,6 +533,7 @@ spa_remove(spa_t *spa)
 
        list_destroy(&spa->spa_config_list);
 
+       nvlist_free(spa->spa_label_features);
        nvlist_free(spa->spa_load_info);
        spa_config_set(spa, NULL);
 
@@ -1025,6 +1041,20 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
  * ==========================================================================
  */
 
+void
+spa_activate_mos_feature(spa_t *spa, const char *feature)
+{
+       (void) nvlist_add_boolean(spa->spa_label_features, feature);
+       vdev_config_dirty(spa->spa_root_vdev);  /* label feature list changed */
+}
+
+void
+spa_deactivate_mos_feature(spa_t *spa, const char *feature)
+{
+       if (nvlist_remove_all(spa->spa_label_features, feature) == 0)
+               vdev_config_dirty(spa->spa_root_vdev);  /* list changed */
+}
+
 /*
  * Rename a spa_t.
  */
@@ -1175,12 +1205,22 @@ spa_generate_guid(spa_t *spa)
 void
 sprintf_blkptr(char *buf, const blkptr_t *bp)
 {
-       char *type = NULL;
+       char type[256];
        char *checksum = NULL;
        char *compress = NULL;
 
        if (bp != NULL) {
-               type = dmu_ot[BP_GET_TYPE(bp)].ot_name;
+               if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
+                       dmu_object_byteswap_t bswap =
+                           DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
+                       (void) snprintf(type, sizeof (type), "bswap %s %s",
+                           DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ?
+                           "metadata" : "data",
+                           dmu_ot_byteswap[bswap].ob_name);
+               } else {
+                       (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
+                           sizeof (type));
+               }
                checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
                compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
        }
@@ -1252,6 +1292,12 @@ spa_get_dsl(spa_t *spa)
        return (spa->spa_dsl_pool);
 }
 
+boolean_t
+spa_is_initializing(spa_t *spa)
+{
+       return (spa->spa_is_initializing);      /* plain read, no lock taken */
+}
+
 blkptr_t *
 spa_get_rootblkptr(spa_t *spa)
 {
@@ -1288,16 +1334,29 @@ spa_name(spa_t *spa)
 uint64_t
 spa_guid(spa_t *spa)
 {
+       dsl_pool_t *dp = spa_get_dsl(spa);
+       uint64_t guid;
+
        /*
         * If we fail to parse the config during spa_load(), we can go through
         * the error path (which posts an ereport) and end up here with no root
         * vdev.  We stash the original pool guid in 'spa_config_guid' to handle
         * this case.
         */
-       if (spa->spa_root_vdev != NULL)
+       if (spa->spa_root_vdev == NULL)
+               return (spa->spa_config_guid);
+
+       guid = spa->spa_last_synced_guid != 0 ?
+           spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid;
+
+       /*
+        * Return the most recently synced out guid unless we're
+        * in syncing context.
+        */
+       if (dp && dsl_pool_sync_context(dp))
                return (spa->spa_root_vdev->vdev_guid);
        else
-               return (spa->spa_config_guid);
+               return (guid);
 }
 
 uint64_t
@@ -1532,6 +1591,7 @@ spa_init(int mode)
        vdev_cache_stat_init();
        zfs_prop_init();
        zpool_prop_init();
+       zpool_feature_init();
        spa_config_load();
        l2arc_start();
 }
index 17494bc..838a6f6 100644 (file)
@@ -20,6 +20,8 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Portions Copyright 2011 Martin Matuska
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -669,7 +671,7 @@ txg_list_destroy(txg_list_t *tl)
        mutex_destroy(&tl->tl_lock);
 }
 
-int
+boolean_t
 txg_list_empty(txg_list_t *tl, uint64_t txg)
 {
        return (tl->tl_head[txg & TXG_MASK] == NULL);
index e374f6d..b969751 100644 (file)
@@ -1348,8 +1348,9 @@ vdev_validate(vdev_t *vd, boolean_t strict)
        if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
                uint64_t aux_guid = 0;
                nvlist_t *nvl;
+               uint64_t txg = strict ? spa->spa_config_txg : -1ULL;
 
-               if ((label = vdev_label_read_config(vd)) == NULL) {
+               if ((label = vdev_label_read_config(vd, txg)) == NULL) {
                        vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
                            VDEV_AUX_BAD_LABEL);
                        return (0);
@@ -1532,7 +1533,7 @@ vdev_reopen(vdev_t *vd)
                    !l2arc_vdev_present(vd))
                        l2arc_add_vdev(spa, vd);
        } else {
-               (void) vdev_validate(vd, B_TRUE);
+               (void) vdev_validate(vd, spa_last_synced_txg(spa));
        }
 
        /*
@@ -1993,14 +1994,14 @@ vdev_validate_aux(vdev_t *vd)
        if (!vdev_readable(vd))
                return (0);
 
-       if ((label = vdev_label_read_config(vd)) == NULL) {
+       if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
                vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_CORRUPT_DATA);
                return (-1);
        }
 
        if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
-           version > SPA_VERSION ||
+           !SPA_VERSION_IS_SUPPORTED(version) ||
            nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
            guid != vd->vdev_guid ||
            nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
index 7ac2350..1fe36fe 100644 (file)
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
  *     txg             Transaction group in which this label was written
  *     pool_guid       Unique identifier for this pool
  *     vdev_tree       An nvlist describing vdev tree.
+ *     features_for_read
+ *                     An nvlist of the features necessary for reading the MOS.
  *
  * Each leaf device label also contains the following:
  *
@@ -428,13 +432,23 @@ vdev_top_config_generate(spa_t *spa, nvlist_t *config)
        kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
 }
 
+/*
+ * Returns the configuration from the label of the given vdev. For vdevs
+ * which don't have a txg value stored on their label (i.e. spares/cache)
+ * or have not been completely initialized (txg = 0) just return
+ * the configuration from the first valid label we find. Otherwise,
+ * find the most up-to-date label that does not exceed the specified
+ * 'txg' value.
+ */
 nvlist_t *
-vdev_label_read_config(vdev_t *vd)
+vdev_label_read_config(vdev_t *vd, uint64_t txg)
 {
        spa_t *spa = vd->vdev_spa;
        nvlist_t *config = NULL;
        vdev_phys_t *vp;
        zio_t *zio;
+       uint64_t best_txg = 0;
+       int error = 0;
        int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
            ZIO_FLAG_SPECULATIVE;
        int l;
@@ -448,6 +462,7 @@ vdev_label_read_config(vdev_t *vd)
 
 retry:
        for (l = 0; l < VDEV_LABELS; l++) {
+               nvlist_t *label = NULL;
 
                zio = zio_root(spa, NULL, NULL, flags);
 
@@ -457,12 +472,31 @@ retry:
 
                if (zio_wait(zio) == 0 &&
                    nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
-                   &config, 0) == 0)
-                       break;
+                   &label, 0) == 0) {
+                       uint64_t label_txg = 0;
+
+                       /*
+                        * Auxiliary vdevs won't have txg values in their
+                        * labels and newly added vdevs may not have been
+                        * completely initialized so just return the
+                        * configuration from the first valid label we
+                        * encounter.
+                        */
+                       error = nvlist_lookup_uint64(label,
+                           ZPOOL_CONFIG_POOL_TXG, &label_txg);
+                       if ((error || label_txg == 0) && !config) {
+                               config = label;
+                               break;
+                       } else if (label_txg <= txg && label_txg > best_txg) {
+                               best_txg = label_txg;
+                               nvlist_free(config);
+                               config = fnvlist_dup(label);
+                       }
+               }
 
-               if (config != NULL) {
-                       nvlist_free(config);
-                       config = NULL;
+               if (label != NULL) {
+                       nvlist_free(label);
+                       label = NULL;
                }
        }
 
@@ -497,7 +531,7 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
        /*
         * Read the label, if any, and perform some basic sanity checks.
         */
-       if ((label = vdev_label_read_config(vd)) == NULL)
+       if ((label = vdev_label_read_config(vd, -1ULL)) == NULL)
                return (B_FALSE);
 
        (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
@@ -838,7 +872,7 @@ retry:
  * come back up, we fail to see the uberblock for txg + 1 because, say,
  * it was on a mirrored device and the replica to which we wrote txg + 1
  * is now offline.  If we then make some changes and sync txg + 1, and then
- * the missing replica comes back, then for a new seconds we'll have two
+ * the missing replica comes back, then for a few seconds we'll have two
  * conflicting uberblocks on disk with the same txg.  The solution is simple:
  * among uberblocks with equal txg, choose the one with the latest timestamp.
  */
@@ -858,47 +892,49 @@ vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
        return (0);
 }
 
+struct ubl_cbdata {    /* state for vdev_uberblock_load_done() */
+       uberblock_t     *ubl_ubbest;    /* Best uberblock found so far */
+       vdev_t          *ubl_vd;        /* vdev ubl_ubbest was read from */
+};
+
 static void
 vdev_uberblock_load_done(zio_t *zio)
 {
+       vdev_t *vd = zio->io_vd;
        spa_t *spa = zio->io_spa;
        zio_t *rio = zio->io_private;
        uberblock_t *ub = zio->io_data;
-       uberblock_t *ubbest = rio->io_private;
+       struct ubl_cbdata *cbp = rio->io_private;
 
-       ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd));
+       ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd));
 
        if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
                mutex_enter(&rio->io_lock);
                if (ub->ub_txg <= spa->spa_load_max_txg &&
-                   vdev_uberblock_compare(ub, ubbest) > 0)
-                       *ubbest = *ub;
+                   vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) {
+                       /*
+                        * Keep track of the vdev in which this uberblock
+                        * was found. We will use this information later
+                        * to obtain the config nvlist associated with
+                        * this uberblock.
+                        */
+                       *cbp->ubl_ubbest = *ub;
+                       cbp->ubl_vd = vd;
+               }
                mutex_exit(&rio->io_lock);
        }
 
        zio_buf_free(zio->io_data, zio->io_size);
 }
 
-void
-vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest)
+static void
+vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
+    struct ubl_cbdata *cbp)
 {
-       spa_t *spa = vd->vdev_spa;
-       vdev_t *rvd = spa->spa_root_vdev;
-       int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
-           ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
        int c, l, n;
 
-       if (vd == rvd) {
-               ASSERT(zio == NULL);
-               spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-               zio = zio_root(spa, NULL, ubbest, flags);
-               bzero(ubbest, sizeof (uberblock_t));
-       }
-
-       ASSERT(zio != NULL);
-
        for (c = 0; c < vd->vdev_children; c++)
-               vdev_uberblock_load(zio, vd->vdev_child[c], ubbest);
+               vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp);
 
        if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
                for (l = 0; l < VDEV_LABELS; l++) {
@@ -911,11 +947,46 @@ vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest)
                        }
                }
        }
+}
 
-       if (vd == rvd) {
-               (void) zio_wait(zio);
-               spa_config_exit(spa, SCL_ALL, FTAG);
-       }
+/*
+ * Reads the 'best' uberblock from disk along with its associated
+ * configuration. First, we read the uberblock array of each label of each
+ * vdev, keeping track of the uberblock with the highest txg in each array.
+ * Then, we read the configuration from the same vdev as the best uberblock.
+ */
+void
+vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
+{
+       zio_t *zio;
+       spa_t *spa = rvd->vdev_spa;
+       struct ubl_cbdata cb;
+       int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+           ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
+
+       ASSERT(ub);
+       ASSERT(config);
+
+       bzero(ub, sizeof (uberblock_t));        /* all-zero if none is found */
+       *config = NULL;                 /* stays NULL if no vdev is chosen */
+
+       cb.ubl_ubbest = ub;             /* callbacks update *ub in place */
+       cb.ubl_vd = NULL;
+
+       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+       zio = zio_root(spa, NULL, &cb, flags);
+       vdev_uberblock_load_impl(zio, rvd, flags, &cb);
+       (void) zio_wait(zio);           /* outcome is read from cb below */
+
+       /*
+        * It's possible that the best uberblock was discovered on a label
+        * that has a configuration which was written in a future txg.
+        * Search all labels on this vdev to find the configuration that
+        * matches the txg for our uberblock.
+        */
+       if (cb.ubl_vd != NULL)
+               *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg);
+       spa_config_exit(spa, SCL_ALL, FTAG);
+}
 
 /*
index fac54ea..fd3021b 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
@@ -946,6 +947,19 @@ fzap_prefetch(zap_name_t *zn)
  * Helper functions for consumers.
  */
 
+uint64_t
+zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
+    const char *name, dmu_tx_t *tx)
+{
+       uint64_t new_obj;       /* object number of the newly created ZAP */
+
+       VERIFY((new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx)) > 0);
+       VERIFY(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj,
+           tx) == 0);          /* parent_obj: 'name' -> new_obj */
+
+       return (new_obj);
+}
+
 int
 zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
     char *name)
@@ -1080,6 +1094,16 @@ zap_add_int_key(objset_t *os, uint64_t obj,
 }
 
 int
+zap_update_int_key(objset_t *os, uint64_t obj,
+    uint64_t key, uint64_t value, dmu_tx_t *tx)
+{
+       char name[20];          /* 16 hex digits of a uint64_t + NUL fit */
+
+       (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+       return (zap_update(os, obj, name, 8, 1, &value, tx));
+}
+
+int
 zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
 {
        char name[20];
index d5b97da..3d8cae0 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/zio.h>
@@ -461,7 +461,7 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
        {
                dmu_object_info_t doi;
                dmu_object_info_from_db(db, &doi);
-               ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
+               ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
        }
 #endif
 
@@ -585,7 +585,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
        {
                dmu_object_info_t doi;
                dmu_object_info_from_db(db, &doi);
-               ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
+               ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
        }
 #endif
 
diff --git a/module/zfs/zfeature.c b/module/zfs/zfeature.c
new file mode 100644 (file)
index 0000000..c09b32d
--- /dev/null
@@ -0,0 +1,432 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zfeature.h>
+#include <sys/dmu.h>
+#include <sys/nvpair.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include "zfeature_common.h"
+#include <sys/spa_impl.h>
+
+/*
+ * ZFS Feature Flags
+ * -----------------
+ *
+ * ZFS feature flags are used to provide fine-grained versioning to the ZFS
+ * on-disk format. Once enabled on a pool, feature flags replace the old
+ * spa_version() number.
+ *
+ * Each new on-disk format change will be given a uniquely identifying string
+ * guid rather than a version number. This avoids the problem of different
+ * organizations creating new on-disk formats with the same version number. To
+ * keep feature guids unique they should consist of the reverse dns name of the
+ * organization which implemented the feature and a short name for the feature,
+ * separated by a colon (e.g. com.delphix:async_destroy).
+ *
+ * Reference Counts
+ * ----------------
+ *
+ * Within each pool features can be in one of three states: disabled, enabled,
+ * or active. These states are differentiated by a reference count stored on
+ * disk for each feature:
+ *
+ *   1) If there is no reference count stored on disk the feature is disabled.
+ *   2) If the reference count is 0 a system administrator has enabled the
+ *      feature, but the feature has not been used yet, so no on-disk
+ *      format changes have been made.
+ *   3) If the reference count is greater than 0 the feature is active.
+ *      The format changes required by the feature are currently on disk.
+ *      Note that if the feature's format changes are reversed the feature
+ *      may choose to set its reference count back to 0.
+ *
+ * Feature flags make no differentiation between non-zero reference counts
+ * for an active feature (e.g. a reference count of 1 means the same thing as a
+ * reference count of 27834721), but feature implementations may choose to use
+ * the reference count to store meaningful information. For example, a new RAID
+ * implementation might set the reference count to the number of vdevs using
+ * it. If all those disks are removed from the pool the feature goes back to
+ * having a reference count of 0.
+ *
+ * It is the responsibility of the individual features to maintain a non-zero
+ * reference count as long as the feature's format changes are present on disk.
+ *
+ * Dependencies
+ * ------------
+ *
+ * Each feature may depend on other features. The only effect of this
+ * relationship is that when a feature is enabled all of its dependencies are
+ * automatically enabled as well. Any future work to support disabling of
+ * features would need to ensure that features cannot be disabled if other
+ * enabled features depend on them.
+ *
+ * On-disk Format
+ * --------------
+ *
+ * When feature flags are enabled spa_version() is set to SPA_VERSION_FEATURES
+ * (5000). In order for this to work the pool is automatically upgraded to
+ * SPA_VERSION_BEFORE_FEATURES (28) first, so all pre-feature flags on disk
+ * format changes will be in use.
+ *
+ * Information about features is stored in 3 ZAP objects in the pool's MOS.
+ * These objects are linked to by the following names in the pool directory
+ * object:
+ *
+ * 1) features_for_read: feature guid -> reference count
+ *    Features needed to open the pool for reading.
+ * 2) features_for_write: feature guid -> reference count
+ *    Features needed to open the pool for writing.
+ * 3) feature_descriptions: feature guid -> descriptive string
+ *    A human readable string.
+ *
+ * All enabled features appear in either features_for_read or
+ * features_for_write, but not both.
+ *
+ * To open a pool in read-only mode only the features listed in
+ * features_for_read need to be supported.
+ *
+ * To open the pool in read-write mode features in both features_for_read and
+ * features_for_write need to be supported.
+ *
+ * Some features may be required to read the ZAP objects containing feature
+ * information. To allow software to check for compatibility with these features
+ * before the pool is opened their names must be stored in the label in a
+ * new "features_for_read" entry (note that features that are only required
+ * to write to a pool never need to be stored in the label since the
+ * features_for_write ZAP object can be read before the pool is written to).
+ * To save space in the label features must be explicitly marked as needing to
+ * be written to the label. Also, reference counts are not stored in the label,
+ * instead any feature whose reference count drops to 0 is removed from the
+ * label.
+ *
+ * Adding New Features
+ * -------------------
+ *
+ * Features must be registered in the zpool_feature_init() function in
+ * zfeature_common.c using the zfeature_register() function. This function
+ * has arguments to specify if the feature should be stored in the
+ * features_for_read or features_for_write ZAP object and if it needs to be
+ * written to the label when active.
+ *
+ * Once a feature is registered it will appear as a "feature@<feature name>"
+ * property which can be set by an administrator. Feature implementors should
+ * use the spa_feature_is_enabled() and spa_feature_is_active() functions to
+ * query the state of a feature and the spa_feature_incr() and
+ * spa_feature_decr() functions to change an enabled feature's reference count.
+ * Reference counts may only be updated in the syncing context.
+ *
+ * Features may not perform enable-time initialization. Instead, any such
+ * initialization should occur when the feature is first used. This design
+ * enforces that on-disk changes be made only when features are used. Code
+ * should only check if a feature is enabled using spa_feature_is_enabled(),
+ * not by relying on any feature specific metadata existing. If a feature is
+ * enabled, but the feature's metadata is not on disk yet then it should be
+ * created as needed.
+ *
+ * As an example, consider the com.delphix:async_destroy feature. This feature
+ * relies on the existence of a bptree in the MOS that stores blocks for
+ * asynchronous freeing. This bptree is not created when async_destroy is
+ * enabled. Instead, when a dataset is destroyed spa_feature_is_enabled() is
+ * called to check if async_destroy is enabled. If it is and the bptree object
+ * does not exist yet, the bptree object is created as part of the dataset
+ * destroy and async_destroy's reference count is incremented to indicate it
+ * has made an on-disk format change. Later, after the destroyed dataset's
+ * blocks have all been asynchronously freed there is no longer any use for the
+ * bptree object, so it is destroyed and async_destroy's reference count is
+ * decremented back to 0 to indicate that it has undone its on-disk format
+ * changes.
+ */
+
+typedef enum {
+       FEATURE_ACTION_ENABLE,  /* create entry with refcount 0 if absent */
+       FEATURE_ACTION_INCR,    /* refcount++ on an enabled feature */
+       FEATURE_ACTION_DECR,    /* refcount-- on an enabled feature */
+} feature_action_t;
+
+/*
+ * Returns B_FALSE if any feature with a nonzero refcount in ZAP 'obj' is
+ * unsupported, adding each to 'unsup_feat' (guid -> description, "" if none).
+ * Every entry in 'obj' is also copied to 'enabled_feat' (guid -> refcount).
+ */
+boolean_t
+feature_is_supported(objset_t *os, uint64_t obj, uint64_t desc_obj,
+    nvlist_t *unsup_feat, nvlist_t *enabled_feat)   /* either may be NULL */
+{
+       boolean_t supported;
+       zap_cursor_t *zc;
+       zap_attribute_t *za;
+       char *buf;      /* MAXPATHLEN bytes; sizeof (buf) is pointer size */
+
+       zc = kmem_alloc(sizeof(zap_cursor_t), KM_SLEEP);
+       za = kmem_alloc(sizeof(zap_attribute_t), KM_SLEEP);
+       buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+       supported = B_TRUE;
+       for (zap_cursor_init(zc, os, obj);
+           zap_cursor_retrieve(zc, za) == 0;
+           zap_cursor_advance(zc)) {
+               ASSERT(za->za_integer_length == sizeof (uint64_t) &&
+                   za->za_num_integers == 1);
+
+               if (NULL != enabled_feat) {
+                       fnvlist_add_uint64(enabled_feat, za->za_name,
+                           za->za_first_integer);
+               }
+
+               if (za->za_first_integer != 0 &&
+                   !zfeature_is_supported(za->za_name)) {
+                       supported = B_FALSE;
+
+                       if (NULL != unsup_feat) {
+                               char *desc = "";
+
+                               if (zap_lookup(os, desc_obj, za->za_name,
+                                   1, MAXPATHLEN, buf) == 0)
+                                       desc = buf;
+
+                               VERIFY(nvlist_add_string(unsup_feat,
+                                   za->za_name, desc) == 0);
+                       }
+               }
+       }
+       zap_cursor_fini(zc);
+
+       kmem_free(buf, MAXPATHLEN);
+       kmem_free(za, sizeof(zap_attribute_t));
+       kmem_free(zc, sizeof(zap_cursor_t));
+
+       return (supported);
+}
+
+static int
+feature_get_refcount(objset_t *os, uint64_t read_obj, uint64_t write_obj,
+    zfeature_info_t *feature, uint64_t *res)
+{
+       int err;
+       uint64_t refcount;
+       uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj;
+
+       /*
+        * If the pool is currently being created, the feature objects may not
+        * have been allocated yet.  Act as though all features are disabled.
+        */
+       if (zapobj == 0)
+               return (ENOTSUP);
+
+       err = zap_lookup(os, zapobj, feature->fi_guid, sizeof (uint64_t), 1,
+           &refcount);
+       if (err != 0) {
+               if (err == ENOENT)
+                       return (ENOTSUP);       /* no entry: not enabled */
+               else
+                       return (err);   /* other lookup error, passed through */
+       }
+       *res = refcount;        /* *res is left untouched on any error */
+       return (0);
+}
+
+static int
+feature_do_action(objset_t *os, uint64_t read_obj, uint64_t write_obj,
+    uint64_t desc_obj, zfeature_info_t *feature, feature_action_t action,
+    dmu_tx_t *tx)
+{
+       int error;
+       uint64_t refcount;
+       uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj;
+
+       ASSERT(0 != zapobj);
+       ASSERT(zfeature_is_valid_guid(feature->fi_guid));
+
+       error = zap_lookup(os, zapobj, feature->fi_guid,
+           sizeof (uint64_t), 1, &refcount);
+
+       /*
+        * If we can't ascertain the status of the specified feature, an I/O
+        * error occurred.
+        */
+       if (error != 0 && error != ENOENT)
+               return (error);
+
+       switch (action) {
+       case FEATURE_ACTION_ENABLE:
+               /*
+                * If the feature is already enabled, ignore the request.
+                */
+               if (error == 0)
+                       return (0);
+               refcount = 0;           /* newly enabled, not yet active */
+               break;
+       case FEATURE_ACTION_INCR:
+               if (error == ENOENT)
+                       return (ENOTSUP);       /* feature is not enabled */
+               if (refcount == UINT64_MAX)
+                       return (EOVERFLOW);
+               refcount++;
+               break;
+       case FEATURE_ACTION_DECR:
+               if (error == ENOENT)
+                       return (ENOTSUP);       /* feature is not enabled */
+               if (refcount == 0)
+                       return (EOVERFLOW);     /* would wrap below zero */
+               refcount--;
+               break;
+       default:
+               ASSERT(0);
+               break;
+       }
+
+       if (action == FEATURE_ACTION_ENABLE) {
+               int i;          /* fi_depends is walked until a NULL entry */
+
+               for (i = 0; feature->fi_depends[i] != NULL; i++) {
+                       zfeature_info_t *dep = feature->fi_depends[i];
+
+                       error = feature_do_action(os, read_obj, write_obj,
+                           desc_obj, dep, FEATURE_ACTION_ENABLE, tx);
+                       if (error != 0)
+                               return (error);
+               }
+       }
+
+       error = zap_update(os, zapobj, feature->fi_guid,
+           sizeof (uint64_t), 1, &refcount, tx);
+       if (error != 0)
+               return (error);
+
+       if (action == FEATURE_ACTION_ENABLE) {
+               error = zap_update(os, desc_obj,
+                   feature->fi_guid, 1, strlen(feature->fi_desc) + 1,
+                   feature->fi_desc, tx);
+               if (error != 0)
+                       return (error);
+       }
+
+       if (action == FEATURE_ACTION_INCR && refcount == 1 && feature->fi_mos) {
+               spa_activate_mos_feature(dmu_objset_spa(os), feature->fi_guid);
+       }
+
+       if (action == FEATURE_ACTION_DECR && refcount == 0) {
+               spa_deactivate_mos_feature(dmu_objset_spa(os),
+                   feature->fi_guid);  /* note: fi_mos is not checked */
+       }
+
+       return (0);
+}
+
+void
+spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx)
+{
+       /*
+        * We create feature flags ZAP objects in two instances: during pool
+        * creation and during pool upgrade.
+        */
+       ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)) || (!spa->spa_sync_on &&
+           tx->tx_txg == TXG_INITIAL));
+
+       spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset,
+           DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+           DMU_POOL_FEATURES_FOR_READ, tx);    /* guid -> refcount */
+       spa->spa_feat_for_write_obj = zap_create_link(spa->spa_meta_objset,
+           DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+           DMU_POOL_FEATURES_FOR_WRITE, tx);   /* guid -> refcount */
+       spa->spa_feat_desc_obj = zap_create_link(spa->spa_meta_objset,
+           DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+           DMU_POOL_FEATURE_DESCRIPTIONS, tx); /* guid -> description */
+}
+
+/*
+ * Enable required dependencies, then the feature; errors trip the VERIFY3U.
+ */
+void
+spa_feature_enable(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
+{
+       ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
+       VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset,
+           spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
+           spa->spa_feat_desc_obj, feature, FEATURE_ACTION_ENABLE, tx));
+}
+
+/*
+ * Increments the refcount of an already-enabled feature.  This function
+ * returns nothing: if feature_do_action() fails (ENOTSUP when the feature is
+ * not enabled, EOVERFLOW when the refcount is UINT64_MAX) the VERIFY3U below
+ * fires.  This function must be called from syncing context.
+ */
+void
+spa_feature_incr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
+{
+       ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
+       VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset,
+           spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
+           spa->spa_feat_desc_obj, feature, FEATURE_ACTION_INCR, tx));
+}
+
+/*
+ * Decrements the refcount of an already-enabled feature.  This function
+ * returns nothing: if feature_do_action() fails (ENOTSUP when the feature is
+ * not enabled, EOVERFLOW when the refcount is already 0) the VERIFY3U below
+ * fires.  This function must be called from syncing context.
+ */
+void
+spa_feature_decr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
+{
+       ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
+       VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset,
+           spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
+           spa->spa_feat_desc_obj, feature, FEATURE_ACTION_DECR, tx));
+}
+
+boolean_t
+spa_feature_is_enabled(spa_t *spa, zfeature_info_t *feature)
+{
+       int err;
+       uint64_t refcount = 0;  /* fetched but not inspected here */
+
+       if (spa_version(spa) < SPA_VERSION_FEATURES)
+               return (B_FALSE);       /* pool predates feature flags */
+
+       err = feature_get_refcount(spa->spa_meta_objset,
+           spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
+           feature, &refcount);
+       ASSERT(err == 0 || err == ENOTSUP);
+       return (err == 0);      /* a stored refcount of 0 still counts */
+}
+
+boolean_t
+spa_feature_is_active(spa_t *spa, zfeature_info_t *feature)
+{
+       int err;
+       uint64_t refcount = 0;
+
+       if (spa_version(spa) < SPA_VERSION_FEATURES)
+               return (B_FALSE);       /* pool predates feature flags */
+
+       err = feature_get_refcount(spa->spa_meta_objset,
+           spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
+           feature, &refcount);
+       ASSERT(err == 0 || err == ENOTSUP);
+       return (err == 0 && refcount > 0);      /* refcount 0: enabled only */
+}
diff --git a/module/zfs/zfeature_common.c b/module/zfs/zfeature_common.c
new file mode 100644 (file)
index 0000000..4006699
--- /dev/null
@@ -0,0 +1,163 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#ifdef _KERNEL
+#include <sys/systm.h>
+#else
+#include <errno.h>
+#include <string.h>
+#endif
+#include <sys/debug.h>
+#include <sys/fs/zfs.h>
+#include <sys/inttypes.h>
+#include <sys/types.h>
+#include "zfeature_common.h"
+
+/*
+ * Set to disable all feature checks while opening pools, allowing pools with
+ * unsupported features to be opened. Set for testing only.
+ */
+boolean_t zfeature_checks_disable = B_FALSE;
+
+/*
+ * Table of feature descriptors, one slot per feature id below SPA_FEATURES.
+ * Slots are filled in by zfeature_register() and searched by
+ * zfeature_lookup_guid() and zfeature_lookup_name().
+ */
+zfeature_info_t spa_feature_table[SPA_FEATURES];
+
+/*
+ * Valid characters for feature guids. This list is mainly for aesthetic
+ * purposes and could be expanded in the future. Lowercase letters and digits
+ * are accepted everywhere; in addition, '.' is accepted in the guid's reverse
+ * dns portion (before the colon) and '_' in its short name (after the colon).
+ *
+ * Returns non-zero if 'c' is acceptable at that position, zero otherwise.
+ */
+static int
+valid_char(char c, boolean_t after_colon)
+{
+       if (c >= 'a' && c <= 'z')
+               return (1);
+       if (c >= '0' && c <= '9')
+               return (1);
+
+       /* The one permitted punctuation character depends on position. */
+       if (after_colon)
+               return (c == '_');
+       return (c == '.');
+}
+
+/*
+ * Every feature guid must contain exactly one colon which separates a reverse
+ * dns organization name from the feature's "short" name (e.g.
+ * "com.company:feature_name"). Every other character must pass valid_char()
+ * for the side of the colon it appears on.
+ *
+ * Returns B_TRUE if 'name' satisfies these rules, B_FALSE otherwise.
+ */
+boolean_t
+zfeature_is_valid_guid(const char *name)
+{
+       boolean_t seen_colon = B_FALSE;
+       const char *p;
+
+       for (p = name; *p != '\0'; p++) {
+               if (*p != ':') {
+                       if (!valid_char(*p, seen_colon))
+                               return (B_FALSE);
+               } else if (seen_colon) {
+                       /* A second colon is never allowed. */
+                       return (B_FALSE);
+               } else {
+                       seen_colon = B_TRUE;
+               }
+       }
+
+       return (seen_colon);
+}
+
+/*
+ * Report whether the feature identified by 'guid' is supported: either
+ * feature checks are globally disabled via zfeature_checks_disable, or the
+ * guid is found by zfeature_lookup_guid().
+ */
+boolean_t
+zfeature_is_supported(const char *guid)
+{
+       return (zfeature_checks_disable ||
+           zfeature_lookup_guid(guid, NULL) == 0);
+}
+
+/*
+ * Search spa_feature_table for the entry whose fi_guid equals 'guid'.
+ *
+ * Returns 0 and, if 'res' is not NULL, stores a pointer to the matching
+ * entry in *res. Returns ENOENT when no entry matches; *res is then left
+ * untouched.
+ *
+ * NOTE(review): every slot's fi_guid is handed to strcmp(), so this
+ * presumably requires zpool_feature_init() to have run first -- confirm.
+ */
+int
+zfeature_lookup_guid(const char *guid, zfeature_info_t **res)
+{
+       zfeature_info_t *end = spa_feature_table + SPA_FEATURES;
+       zfeature_info_t *fi;
+
+       for (fi = spa_feature_table; fi != end; fi++) {
+               if (strcmp(guid, fi->fi_guid) != 0)
+                       continue;
+
+               if (res != NULL)
+                       *res = fi;
+               return (0);
+       }
+
+       return (ENOENT);
+}
+
+/*
+ * Search spa_feature_table for the entry whose fi_uname equals 'name'.
+ *
+ * Returns 0 and, if 'res' is not NULL, stores a pointer to the matching
+ * entry in *res. Returns ENOENT when no entry matches; *res is then left
+ * untouched.
+ *
+ * NOTE(review): every slot's fi_uname is handed to strcmp(), so this
+ * presumably requires zpool_feature_init() to have run first -- confirm.
+ */
+int
+zfeature_lookup_name(const char *name, zfeature_info_t **res)
+{
+       zfeature_info_t *end = spa_feature_table + SPA_FEATURES;
+       zfeature_info_t *fi;
+
+       for (fi = spa_feature_table; fi != end; fi++) {
+               if (strcmp(name, fi->fi_uname) != 0)
+                       continue;
+
+               if (res != NULL)
+                       *res = fi;
+               return (0);
+       }
+
+       return (ENOENT);
+}
+
+/*
+ * Fill in the spa_feature_table slot for feature id 'fid'.
+ *
+ *   guid      feature guid; must satisfy zfeature_is_valid_guid()
+ *   name      short user-visible name, stored as fi_uname
+ *   desc      human-readable description, stored as fi_desc
+ *   readonly  stored as fi_can_readonly
+ *   mos       stored as fi_mos; may not be combined with 'readonly'
+ *   deps      NULL-terminated array of features this one depends on, or
+ *             NULL for none (a shared empty array is substituted)
+ *
+ * The strings and the deps array are stored by pointer, not copied, so they
+ * must outlive the table entry.
+ */
+static void
+zfeature_register(int fid, const char *guid, const char *name, const char *desc,
+    boolean_t readonly, boolean_t mos, zfeature_info_t **deps)
+{
+       zfeature_info_t *feature;
+       static zfeature_info_t *nodeps[] = { NULL };
+
+       /*
+        * Validate the arguments before touching the table: the slot address
+        * is only formed once fid is known to be in range, and guid is
+        * checked for NULL before zfeature_is_valid_guid() dereferences it.
+        */
+       ASSERT3U(fid, <, SPA_FEATURES);
+       ASSERT(guid != NULL);
+       ASSERT(name != NULL);
+       ASSERT(desc != NULL);
+       ASSERT(!readonly || !mos);
+       ASSERT(zfeature_is_valid_guid(guid));
+
+       if (deps == NULL)
+               deps = nodeps;
+
+       feature = &spa_feature_table[fid];
+       feature->fi_guid = guid;
+       feature->fi_uname = name;
+       feature->fi_desc = desc;
+       feature->fi_can_readonly = readonly;
+       feature->fi_mos = mos;
+       feature->fi_depends = deps;
+}
+
+/*
+ * Register every feature this code knows about in spa_feature_table. Both
+ * features registered here pass B_TRUE for 'readonly', B_FALSE for 'mos',
+ * and declare no dependencies.
+ */
+void
+zpool_feature_init(void)
+{
+       zfeature_register(SPA_FEATURE_ASYNC_DESTROY,
+           "com.delphix:async_destroy",
+           "async_destroy",
+           "Destroy filesystems asynchronously.",
+           B_TRUE,     /* readonly */
+           B_FALSE,    /* mos */
+           NULL);      /* deps */
+
+       zfeature_register(SPA_FEATURE_EMPTY_BPOBJ,
+           "com.delphix:empty_bpobj",
+           "empty_bpobj",
+           "Snapshots use less space.",
+           B_TRUE,     /* readonly */
+           B_FALSE,    /* mos */
+           NULL);      /* deps */
+}
index c609203..98b1dd7 100644 (file)
@@ -18,6 +18,7 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Portions Copyright 2011 Martin Matuska
@@ -1107,6 +1108,8 @@ get_zfs_sb(const char *dsname, zfs_sb_t **zsbp)
 /*
  * Find a zfs_sb_t for a mounted filesystem, or create our own, in which
  * case its z_sb will be NULL, and it will be opened as the owner.
+ * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER,
+ * which prevents all inode ops from running.
  */
 static int
 zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer)
@@ -1170,7 +1173,7 @@ zfs_ioc_pool_create(zfs_cmd_t *zc)
 
                (void) nvlist_lookup_uint64(props,
                    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version);
-               if (version < SPA_VERSION_INITIAL || version > SPA_VERSION) {
+               if (!SPA_VERSION_IS_SUPPORTED(version)) {
                        error = EINVAL;
                        goto pool_props_bad;
                }
@@ -1297,6 +1300,15 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc)
        return (error);
 }
 
+/*
+ * inputs:
+ * zc_name             name of the pool
+ *
+ * outputs:
+ * zc_cookie           real errno
+ * zc_nvlist_dst       config nvlist
+ * zc_nvlist_dst_size  size of config nvlist
+ */
 static int
 zfs_ioc_pool_stats(zfs_cmd_t *zc)
 {
@@ -1398,7 +1410,8 @@ zfs_ioc_pool_upgrade(zfs_cmd_t *zc)
        if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
                return (error);
 
-       if (zc->zc_cookie < spa_version(spa) || zc->zc_cookie > SPA_VERSION) {
+       if (zc->zc_cookie < spa_version(spa) ||
+           !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) {
                spa_close(spa, FTAG);
                return (EINVAL);
        }
index 220f2d7..c9618c1 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -480,6 +480,38 @@ zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg, boolean_t fastwrite)
 }
 
 /*
+ * Called when we create in-memory log transactions so that we know
+ * to cleanup the itxs at the end of spa_sync().
+ */
+void
+zilog_dirty(zilog_t *zilog, uint64_t txg)
+{
+       dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
+       dsl_pool_t *dp = zilog->zl_dmu_pool;
+
+       /* A snapshot's log must never be dirtied. */
+       if (dsl_dataset_is_snapshot(ds))
+               panic("dirtying snapshot!");
+
+       /*
+        * Nothing more to do unless txg_list_add() returned 0 (presumably
+        * meaning the zilog was not already on the list for this txg).
+        */
+       if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg) != 0)
+               return;
+
+       /* up the hold count until we can be written out */
+       dmu_buf_add_ref(ds->ds_dbuf, zilog);
+}
+
+/*
+ * Returns B_TRUE if this zilog is on the pool's dp_dirty_zilogs list for any
+ * of the TXG_SIZE txg slots, B_FALSE otherwise.
+ */
+boolean_t
+zilog_is_dirty(zilog_t *zilog)
+{
+       dsl_pool_t *dp = zilog->zl_dmu_pool;
+       int t = 0;
+
+       while (t < TXG_SIZE) {
+               if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t))
+                       return (B_TRUE);
+               t++;
+       }
+
+       return (B_FALSE);
+}
+
+/*
  * Create an on-disk intent log.
  */
 static lwb_t *
@@ -601,14 +633,21 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
                        kmem_cache_free(zil_lwb_cache, lwb);
                }
        } else if (!keep_first) {
-               (void) zil_parse(zilog, zil_free_log_block,
-                   zil_free_log_record, tx, zh->zh_claim_txg);
+               zil_destroy_sync(zilog, tx);
        }
        mutex_exit(&zilog->zl_lock);
 
        dmu_tx_commit(tx);
 }
 
+/*
+ * Walk the on-disk log with zil_parse(), using zil_free_log_block and
+ * zil_free_log_record as the block and record callbacks, in the context of
+ * the caller-supplied transaction 'tx'. The walk is given the claim txg
+ * recorded in the log header. The zilog's in-memory lwb list must be empty.
+ * The return value of zil_parse() is deliberately ignored.
+ */
+void
+zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
+{
+       ASSERT(list_is_empty(&zilog->zl_lwb_list));
+       (void) zil_parse(zilog, zil_free_log_block,
+           zil_free_log_record, tx, zilog->zl_header->zh_claim_txg);
+}
+
 int
 zil_claim(const char *osname, void *txarg)
 {
@@ -1042,6 +1081,8 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
                return (NULL);
 
        ASSERT(lwb->lwb_buf != NULL);
+       ASSERT(zilog_is_dirty(zilog) ||
+           spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
 
        if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
                dlen = P2ROUNDUP_TYPED(
@@ -1272,7 +1313,7 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
        if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
                zil_async_to_sync(zilog, itx->itx_oid);
 
-       if (spa_freeze_txg(zilog->zl_spa) !=  UINT64_MAX)
+       if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
                txg = ZILTEST_TXG;
        else
                txg = dmu_tx_get_txg(tx);
@@ -1323,6 +1364,7 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
        }
 
        itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
+       zilog_dirty(zilog, txg);
        mutex_exit(&itxg->itxg_lock);
 
        /* Release the old itxs now we've dropped the lock */
@@ -1332,7 +1374,10 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
 
 /*
  * If there are any in-memory intent log transactions which have now been
- * synced then start up a taskq to free them.
+ * synced then start up a taskq to free them. We should only do this after we
+ * have written out the uberblocks (i.e. txg has been committed) so that we
+ * don't inadvertently clean out in-memory log records that would be required
+ * by zil_commit().
  */
 void
 zil_clean(zilog_t *zilog, uint64_t synced_txg)
@@ -1837,6 +1882,7 @@ zil_close(zilog_t *zilog)
        mutex_exit(&zilog->zl_lock);
        if (txg)
                txg_wait_synced(zilog->zl_dmu_pool, txg);
+       ASSERT(!zilog_is_dirty(zilog));
 
        taskq_destroy(zilog->zl_clean_taskq);
        zilog->zl_clean_taskq = NULL;
index bfb817b..638105a 100644 (file)
@@ -704,7 +704,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
            zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
            zp->zp_compress >= ZIO_COMPRESS_OFF &&
            zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
-           zp->zp_type < DMU_OT_NUMTYPES &&
+           DMU_OT_IS_VALID(zp->zp_type) &&
            zp->zp_level < 32 &&
            zp->zp_copies > 0 &&
            zp->zp_copies <= spa_max_replication(spa) &&
@@ -988,7 +988,7 @@ zio_read_bp_init(zio_t *zio)
                zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
        }
 
-       if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
+       if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
                zio->io_flags |= ZIO_FLAG_DONT_CACHE;
 
        if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
@@ -1248,7 +1248,7 @@ __zio_execute(zio_t *zio)
        while (zio->io_stage < ZIO_STAGE_DONE) {
                enum zio_stage pipeline = zio->io_pipeline;
                enum zio_stage stage = zio->io_stage;
-               dsl_pool_t *dsl;
+               dsl_pool_t *dp;
                boolean_t cut;
                int rv;
 
@@ -1262,7 +1262,7 @@ __zio_execute(zio_t *zio)
 
                ASSERT(stage <= ZIO_STAGE_DONE);
 
-               dsl = spa_get_dsl(zio->io_spa);
+               dp = spa_get_dsl(zio->io_spa);
                cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
                    zio_requeue_io_start_cut_in_line : B_FALSE;
 
@@ -1272,16 +1272,24 @@ __zio_execute(zio_t *zio)
                 * or may wait for an I/O that needs an interrupt thread
                 * to complete, issue async to avoid deadlock.
                 *
-                * If we are in the txg_sync_thread or being called
-                * during pool init issue async to minimize stack depth.
-                * Both of these call paths may be recursively called.
-                *
                 * For VDEV_IO_START, we cut in line so that the io will
                 * be sent to disk promptly.
                 */
-               if (((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
-                   zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) ||
-                   (dsl != NULL && dsl_pool_sync_context(dsl))) {
+               if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
+                   zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
+                       zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
+                       return;
+               }
+
+               /*
+                * If we are executing in the context of the tx_sync_thread,
+                * or we are performing pool initialization outside of a
+                * zio_taskq[ZIO_TASKQ_ISSUE] context, then issue the zio
+                * async to minimize stack usage for these deep call paths.
+                */
+               if ((dp && curthread == dp->dp_tx.tx_sync_thread) ||
+                   (dp && spa_is_initializing(dp->dp_spa) &&
+                   !zio_taskq_member(zio, ZIO_TASKQ_ISSUE))) {
                        zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
                        return;
                }
@@ -3131,6 +3139,48 @@ static zio_pipe_stage_t *zio_pipeline[] = {
        zio_done
 };
 
+/*
+ * Decide whether bookmark zb1 is considered to come before bookmark zb2
+ * (presumably in traversal order -- confirm against callers).
+ *
+ * dnp is the dnode for zb1->zb_object; it may be NULL, in which case the
+ * answer is always B_FALSE unless zb2 is in the deadlist object. Both
+ * bookmarks must refer to the same objset and zb2 must be a level-0
+ * bookmark (both are asserted).
+ */
+boolean_t
+zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
+    const zbookmark_t *zb2)
+{
+       uint64_t zb1nextL0, zb2thisobj;
+
+       ASSERT(zb1->zb_objset == zb2->zb_objset);
+       ASSERT(zb2->zb_level == 0);
+
+       /*
+        * A bookmark in the deadlist is considered to be after
+        * everything else.
+        */
+       if (zb2->zb_object == DMU_DEADLIST_OBJECT)
+               return (B_TRUE);
+
+       /* The objset_phys_t isn't before anything. */
+       if (dnp == NULL)
+               return (B_FALSE);
+
+       /*
+        * (zb1's block id + 1) scaled into level-0 block units: each level
+        * above 0 multiplies by 2^(dn_indblkshift - SPA_BLKPTRSHIFT). Going
+        * by the name, this is the first level-0 block id past zb1.
+        */
+       zb1nextL0 = (zb1->zb_blkid + 1) <<
+           ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+
+       /*
+        * zb2's object number. When zb2->zb_object is zero, an object number
+        * is derived from zb2's block id instead, scaled by
+        * 2^(DNODE_BLOCK_SHIFT - DNODE_SHIFT).
+        */
+       zb2thisobj = zb2->zb_object ? zb2->zb_object :
+           zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
+
+       if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
+               /*
+                * Convert zb1nextL0 from blocks to an object number:
+                * multiply by the data block size in bytes
+                * (dn_datablkszsec << SPA_MINBLOCKSHIFT), then shift right
+                * by DNODE_SHIFT.
+                */
+               uint64_t nextobj = zb1nextL0 *
+                   (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
+               return (nextobj <= zb2thisobj);
+       }
+
+       /* Different objects: the object numbers alone decide the order. */
+       if (zb1->zb_object < zb2thisobj)
+               return (B_TRUE);
+       if (zb1->zb_object > zb2thisobj)
+               return (B_FALSE);
+       if (zb2->zb_object == DMU_META_DNODE_OBJECT)
+               return (B_FALSE);
+       /* Same object: compare zb1's next level-0 block id to zb2's blkid. */
+       return (zb1nextL0 <= zb2->zb_blkid);
+}
+
 #if defined(_KERNEL) && defined(HAVE_SPL)
 /* Fault injection */
 EXPORT_SYMBOL(zio_injection_enabled);