Rebase master to b121
author Brian Behlendorf <behlendorf1@llnl.gov>
Tue, 18 Aug 2009 18:43:27 +0000 (11:43 -0700)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Tue, 18 Aug 2009 18:43:27 +0000 (11:43 -0700)
64 files changed:
ZFS.RELEASE
cmd/zdb/zdb.c
cmd/zfs/zfs_iter.c
cmd/zfs/zfs_main.c
cmd/zpool/zpool_main.c
cmd/zpool/zpool_vdev.c
cmd/ztest/ztest.c
lib/libzfs/include/libzfs.h
lib/libzfs/libzfs_changelist.c
lib/libzfs/libzfs_dataset.c
lib/libzfs/libzfs_sendrecv.c
lib/libzfs/libzfs_status.c
lib/libzfs/libzfs_util.c
module/zcommon/include/sys/fs/zfs.h
module/zcommon/include/zfs_deleg.h
module/zcommon/zfs_deleg.c
module/zcommon/zfs_namecheck.c
module/zcommon/zfs_prop.c
module/zfs/dmu.c
module/zfs/dmu_objset.c
module/zfs/dmu_send.c
module/zfs/dmu_tx.c
module/zfs/dnode.c
module/zfs/dsl_dataset.c
module/zfs/dsl_deleg.c
module/zfs/dsl_dir.c
module/zfs/dsl_prop.c
module/zfs/dsl_scrub.c
module/zfs/include/sys/dmu.h
module/zfs/include/sys/dmu_impl.h
module/zfs/include/sys/dmu_objset.h
module/zfs/include/sys/dsl_dataset.h
module/zfs/include/sys/dsl_deleg.h
module/zfs/include/sys/dsl_prop.h
module/zfs/include/sys/spa.h
module/zfs/include/sys/spa_impl.h
module/zfs/include/sys/vdev.h
module/zfs/include/sys/vdev_impl.h
module/zfs/include/sys/zap.h
module/zfs/include/sys/zfs_acl.h
module/zfs/include/sys/zfs_ioctl.h
module/zfs/include/sys/zfs_vfsops.h
module/zfs/include/sys/zfs_znode.h
module/zfs/include/sys/zio.h
module/zfs/rrwlock.c
module/zfs/spa.c
module/zfs/spa_config.c
module/zfs/spa_history.c
module/zfs/spa_misc.c
module/zfs/vdev.c
module/zfs/vdev_label.c
module/zfs/vdev_mirror.c
module/zfs/vdev_queue.c
module/zfs/vdev_raidz.c
module/zfs/vdev_root.c
module/zfs/zap_micro.c
module/zfs/zfs_acl.c
module/zfs/zfs_ctldir.c
module/zfs/zfs_fuid.c
module/zfs/zfs_ioctl.c
module/zfs/zfs_vfsops.c
module/zfs/zfs_vnops.c
module/zfs/zfs_znode.c
module/zfs/zil.c

index 8b7cfb5..0960bf4 100644 (file)
@@ -1 +1 @@
-http://dlc.sun.com/osol/on/downloads/b117/on-src.tar.bz2
+http://dlc.sun.com/osol/on/downloads/b121/on-src.tar.bz2
index a310fc3..292bb51 100644 (file)
@@ -818,6 +818,8 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
            (u_longlong_t)ds->ds_snapnames_zapobj);
        (void) printf("\t\tnum_children = %llu\n",
            (u_longlong_t)ds->ds_num_children);
+       (void) printf("\t\tuserrefs_obj = %llu\n",
+           (u_longlong_t)ds->ds_userrefs_obj);
        (void) printf("\t\tcreation_time = %s", ctime(&crtime));
        (void) printf("\t\tcreation_txg = %llu\n",
            (u_longlong_t)ds->ds_creation_txg);
@@ -1049,6 +1051,7 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES] = {
        dump_zap,               /* DSL scrub queue              */
        dump_zap,               /* ZFS user/group used          */
        dump_zap,               /* ZFS user/group quota         */
+       dump_zap,               /* snapshot refcount tags       */
 };
 
 static void
index ca5c2b2..04dd2bd 100644 (file)
@@ -362,7 +362,7 @@ zfs_for_each(int argc, char **argv, int flags, zfs_type_t types,
        cb.cb_types = types;
        cb.cb_depth_limit = limit;
        /*
-        * If cb_proplist is provided then in the zfs_handles created  we
+        * If cb_proplist is provided then in the zfs_handles created we
         * retain only those properties listed in cb_proplist and sortcol.
         * The rest are pruned. So, the caller should make sure that no other
         * properties other than those listed in cb_proplist/sortcol are
index 0752a47..1fbd8bc 100644 (file)
@@ -80,6 +80,8 @@ static int zfs_do_receive(int argc, char **argv);
 static int zfs_do_promote(int argc, char **argv);
 static int zfs_do_userspace(int argc, char **argv);
 static int zfs_do_python(int argc, char **argv);
+static int zfs_do_hold(int argc, char **argv);
+static int zfs_do_release(int argc, char **argv);
 
 /*
  * Enable a reasonable set of defaults for libumem debugging on DEBUG builds.
@@ -121,7 +123,10 @@ typedef enum {
        HELP_ALLOW,
        HELP_UNALLOW,
        HELP_USERSPACE,
-       HELP_GROUPSPACE
+       HELP_GROUPSPACE,
+       HELP_HOLD,
+       HELP_HOLDS,
+       HELP_RELEASE
 } zfs_help_t;
 
 typedef struct zfs_command {
@@ -169,6 +174,10 @@ static zfs_command_t command_table[] = {
        { "allow",      zfs_do_python,          HELP_ALLOW              },
        { NULL },
        { "unallow",    zfs_do_python,          HELP_UNALLOW            },
+       { NULL },
+       { "hold",       zfs_do_hold,            HELP_HOLD               },
+       { "holds",      zfs_do_python,          HELP_HOLDS              },
+       { "release",    zfs_do_release,         HELP_RELEASE            },
 };
 
 #define        NCOMMAND        (sizeof (command_table) / sizeof (command_table[0]))
@@ -189,7 +198,8 @@ get_usage(zfs_help_t idx)
                    "-V <size> <volume>\n"));
        case HELP_DESTROY:
                return (gettext("\tdestroy [-rRf] "
-                   "<filesystem|volume|snapshot>\n"));
+                   "<filesystem|volume|snapshot>\n"
+                   "\tdestroy -d [-r] <filesystem|volume|snapshot>\n"));
        case HELP_GET:
                return (gettext("\tget [-rHp] [-d max] "
                    "[-o field[,...]] [-s source[,...]]\n"
@@ -236,7 +246,7 @@ get_usage(zfs_help_t idx)
                return (gettext("\tunmount [-f] "
                    "<-a | filesystem|mountpoint>\n"));
        case HELP_UNSHARE:
-               return (gettext("\tunshare [-f] "
+               return (gettext("\tunshare "
                    "<-a | filesystem|mountpoint>\n"));
        case HELP_ALLOW:
                return (gettext("\tallow <filesystem|volume>\n"
@@ -266,6 +276,12 @@ get_usage(zfs_help_t idx)
                return (gettext("\tgroupspace [-hniHpU] [-o field[,...]] "
                    "[-sS field] ... [-t type[,...]]\n"
                    "\t    <filesystem|snapshot>\n"));
+       case HELP_HOLD:
+               return (gettext("\thold [-r] <tag> <snapshot> ...\n"));
+       case HELP_HOLDS:
+               return (gettext("\tholds [-r] <snapshot> ...\n"));
+       case HELP_RELEASE:
+               return (gettext("\trelease [-r] <tag> <snapshot> ...\n"));
        }
 
        abort();
@@ -769,11 +785,13 @@ badusage:
 }
 
 /*
- * zfs destroy [-rf] <fs, snap, vol>
+ * zfs destroy [-rRf] <fs, snap, vol>
+ * zfs destroy -d [-r] <fs, snap, vol>
  *
  *     -r      Recursively destroy all children
  *     -R      Recursively destroy all dependents, including clones
  *     -f      Force unmounting of any dependents
+ *     -d      If we can't destroy now, mark for deferred destruction
  *
  * Destroys the given dataset.  By default, it will unmount any filesystems,
  * and refuse to destroy a dataset that has any dependents.  A dependent can
@@ -789,6 +807,7 @@ typedef struct destroy_cbdata {
        boolean_t       cb_closezhp;
        zfs_handle_t    *cb_target;
        char            *cb_snapname;
+       boolean_t       cb_defer_destroy;
 } destroy_cbdata_t;
 
 /*
@@ -869,7 +888,7 @@ destroy_callback(zfs_handle_t *zhp, void *data)
         * Bail out on the first error.
         */
        if (zfs_unmount(zhp, NULL, cbp->cb_force ? MS_FORCE : 0) != 0 ||
-           zfs_destroy(zhp) != 0) {
+           zfs_destroy(zhp, cbp->cb_defer_destroy) != 0) {
                zfs_close(zhp);
                return (-1);
        }
@@ -923,8 +942,11 @@ zfs_do_destroy(int argc, char **argv)
        char *cp;
 
        /* check options */
-       while ((c = getopt(argc, argv, "frR")) != -1) {
+       while ((c = getopt(argc, argv, "dfrR")) != -1) {
                switch (c) {
+               case 'd':
+                       cb.cb_defer_destroy = B_TRUE;
+                       break;
                case 'f':
                        cb.cb_force = 1;
                        break;
@@ -956,6 +978,9 @@ zfs_do_destroy(int argc, char **argv)
                usage(B_FALSE);
        }
 
+       if (cb.cb_defer_destroy && cb.cb_doclones)
+               usage(B_FALSE);
+
        /*
         * If we are doing recursive destroy of a snapshot, then the
         * named snapshot may not exist.  Go straight to libzfs.
@@ -977,7 +1002,7 @@ zfs_do_destroy(int argc, char **argv)
                        }
                }
 
-               ret = zfs_destroy_snaps(zhp, cp);
+               ret = zfs_destroy_snaps(zhp, cp, cb.cb_defer_destroy);
                zfs_close(zhp);
                if (ret) {
                        (void) fprintf(stderr,
@@ -986,7 +1011,6 @@ zfs_do_destroy(int argc, char **argv)
                return (ret != 0);
        }
 
-
        /* Open the given dataset */
        if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL)
                return (1);
@@ -1014,15 +1038,15 @@ zfs_do_destroy(int argc, char **argv)
         * Check for any dependents and/or clones.
         */
        cb.cb_first = B_TRUE;
-       if (!cb.cb_doclones &&
+       if (!cb.cb_doclones && !cb.cb_defer_destroy &&
            zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent,
            &cb) != 0) {
                zfs_close(zhp);
                return (1);
        }
 
-       if (cb.cb_error ||
-           zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0) {
+       if (cb.cb_error || (!cb.cb_defer_destroy &&
+           (zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0))) {
                zfs_close(zhp);
                return (1);
        }
@@ -1035,7 +1059,6 @@ zfs_do_destroy(int argc, char **argv)
        if (destroy_callback(zhp, &cb) != 0)
                return (1);
 
-
        return (0);
 }
 
@@ -1613,7 +1636,7 @@ zfs_do_upgrade(int argc, char **argv)
                (void) printf(gettext(" 1   Initial ZFS filesystem version\n"));
                (void) printf(gettext(" 2   Enhanced directory entries\n"));
                (void) printf(gettext(" 3   Case insensitive and File system "
-                   "unique identifer (FUID)\n"));
+                   "unique identifier (FUID)\n"));
                (void) printf(gettext(" 4   userquota, groupquota "
                    "properties\n"));
                (void) printf(gettext("\nFor more information on a particular "
@@ -2651,6 +2674,108 @@ zfs_do_receive(int argc, char **argv)
        return (err != 0);
 }
 
+/*
+ * Common implementation of 'zfs hold' and 'zfs release'.
+ *
+ * Parses the optional -r flag, takes the first operand as the tag and
+ * treats every remaining operand as a <dataset>@<snapshot> name.  For each
+ * such operand the dataset part is opened and zfs_hold() (when 'holding'
+ * is B_TRUE) or zfs_release() is called with the snapshot component, the
+ * tag and the recursive flag.  A failing operand is counted and the loop
+ * moves on to the next one.
+ *
+ * Returns 0 if every operand succeeded, 1 otherwise.
+ */
+static int
+zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding)
+{
+       int errors = 0;
+       int i;
+       const char *tag;
+       boolean_t recursive = B_FALSE;
+       int c;
+       int (*func)(zfs_handle_t *, const char *, const char *, boolean_t);
+
+       /* check options */
+       while ((c = getopt(argc, argv, "r")) != -1) {
+               switch (c) {
+               case 'r':
+                       recursive = B_TRUE;
+                       break;
+               case '?':
+                       (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+                           optopt);
+                       /* NOTE(review): usage() presumably exits - confirm */
+                       usage(B_FALSE);
+               }
+       }
+
+       argc -= optind;
+       argv += optind;
+
+       /* check number of arguments: a tag plus at least one snapshot */
+       if (argc < 2)
+               usage(B_FALSE);
+
+       tag = argv[0];
+       --argc;
+       ++argv;
+
+       if (holding) {
+               if (tag[0] == '.') {
+                       /* tags starting with '.' are reserved for libzfs */
+                       (void) fprintf(stderr,
+                           gettext("tag may not start with '.'\n"));
+                       usage(B_FALSE);
+               }
+               func = zfs_hold;
+       } else {
+               func = zfs_release;
+       }
+
+       for (i = 0; i < argc; ++i) {
+               zfs_handle_t *zhp;
+               char parent[ZFS_MAXNAMELEN];
+               const char *delim;
+               char *path = argv[i];
+               size_t parentlen;
+
+               delim = strchr(path, '@');
+               if (delim == NULL) {
+                       (void) fprintf(stderr,
+                           gettext("'%s' is not a snapshot\n"), path);
+                       ++errors;
+                       continue;
+               }
+
+               /*
+                * The operand comes straight from the command line, so the
+                * dataset part must be checked against the size of 'parent'
+                * (including the terminating NUL) before it is copied.
+                */
+               parentlen = delim - path;
+               if (parentlen >= sizeof (parent)) {
+                       (void) fprintf(stderr,
+                           gettext("'%s': dataset name is too long\n"),
+                           path);
+                       ++errors;
+                       continue;
+               }
+               (void) strncpy(parent, path, parentlen);
+               parent[parentlen] = '\0';
+
+               zhp = zfs_open(g_zfs, parent,
+                   ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+               if (zhp == NULL) {
+                       ++errors;
+                       continue;
+               }
+               /* delim+1 is the snapshot component following the '@' */
+               if (func(zhp, delim+1, tag, recursive) != 0)
+                       ++errors;
+               zfs_close(zhp);
+       }
+
+       return (errors != 0);
+}
+
+/*
+ * zfs hold [-r] <tag> <snap> ...
+ *
+ *     -r      Recursively hold
+ *
+ * Entry point for the 'hold' subcommand: adds a user-hold named <tag> to
+ * every snapshot listed by delegating to the shared hold/release code
+ * with holding == B_TRUE.
+ */
+static int
+zfs_do_hold(int argc, char **argv)
+{
+       int status;
+
+       status = zfs_do_hold_rele_impl(argc, argv, B_TRUE);
+       return (status);
+}
+
+/*
+ * zfs release [-r] <tag> <snap> ...
+ *
+ *     -r      Recursively release
+ *
+ * Entry point for the 'release' subcommand: drops the user-hold named
+ * <tag> from every snapshot listed by delegating to the shared
+ * hold/release code with holding == B_FALSE.
+ */
+static int
+zfs_do_release(int argc, char **argv)
+{
+       int status;
+
+       status = zfs_do_hold_rele_impl(argc, argv, B_FALSE);
+       return (status);
+}
+
 typedef struct get_all_cbdata {
        zfs_handle_t    **cb_handles;
        size_t          cb_alloc;
index c8a33df..c9b092e 100644 (file)
@@ -1469,7 +1469,7 @@ show_import(nvlist_t *config)
  */
 static int
 do_import(nvlist_t *config, const char *newname, const char *mntopts,
-    int force, nvlist_t *props, boolean_t allowfaulted)
+    int force, nvlist_t *props, boolean_t do_verbatim)
 {
        zpool_handle_t *zhp;
        char *name;
@@ -1522,14 +1522,14 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
                }
        }
 
-       if (zpool_import_props(g_zfs, config, newname, props,
-           allowfaulted) != 0)
+       if (zpool_import_props(g_zfs, config, newname, props, do_verbatim) != 0)
                return (1);
 
        if (newname != NULL)
                name = (char *)newname;
 
-       verify((zhp = zpool_open_canfail(g_zfs, name)) != NULL);
+       if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL)
+               return (1);
 
        if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
            zpool_enable_datasets(zhp, mntopts, 0) != 0) {
@@ -1566,7 +1566,8 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
  *       -F    Import even in the presence of faulted vdevs.  This is an
  *             intentionally undocumented option for testing purposes, and
  *             treats the pool configuration as complete, leaving any bad
- *             vdevs in the FAULTED state.
+ *             vdevs in the FAULTED state. In other words, it does verbatim
+ *             import.
  *
  *       -a    Import all pools found.
  *
@@ -1595,7 +1596,7 @@ zpool_do_import(int argc, char **argv)
        nvlist_t *found_config;
        nvlist_t *props = NULL;
        boolean_t first;
-       boolean_t allow_faulted = B_FALSE;
+       boolean_t do_verbatim = B_FALSE;
        uint64_t pool_state;
        char *cachefile = NULL;
 
@@ -1628,7 +1629,7 @@ zpool_do_import(int argc, char **argv)
                        do_force = B_TRUE;
                        break;
                case 'F':
-                       allow_faulted = B_TRUE;
+                       do_verbatim = B_TRUE;
                        break;
                case 'o':
                        if ((propval = strchr(optarg, '=')) != NULL) {
@@ -1778,7 +1779,7 @@ zpool_do_import(int argc, char **argv)
 
                        if (do_all)
                                err |= do_import(config, NULL, mntopts,
-                                   do_force, props, allow_faulted);
+                                   do_force, props, do_verbatim);
                        else
                                show_import(config);
                } else if (searchname != NULL) {
@@ -1826,7 +1827,7 @@ zpool_do_import(int argc, char **argv)
                        err = B_TRUE;
                } else {
                        err |= do_import(found_config, argc == 1 ? NULL :
-                           argv[1], mntopts, do_force, props, allow_faulted);
+                           argv[1], mntopts, do_force, props, do_verbatim);
                }
        }
 
@@ -3117,6 +3118,17 @@ status_callback(zpool_handle_t *zhp, void *data)
                    "replace'.\n"));
                break;
 
+       case ZPOOL_STATUS_REMOVED_DEV:
+               (void) printf(gettext("status: One or more devices has "
+                   "been removed by the administrator.\n\tSufficient "
+                   "replicas exist for the pool to continue functioning in "
+                   "a\n\tdegraded state.\n"));
+               (void) printf(gettext("action: Online the device using "
+                   "'zpool online' or replace the device with\n\t'zpool "
+                   "replace'.\n"));
+               break;
+
+
        case ZPOOL_STATUS_RESILVERING:
                (void) printf(gettext("status: One or more devices is "
                    "currently being resilvered.  The pool will\n\tcontinue "
@@ -3539,6 +3551,8 @@ zpool_do_upgrade(int argc, char **argv)
                (void) printf(gettext(" 14  passthrough-x aclinherit\n"));
                (void) printf(gettext(" 15  user/group space accounting\n"));
                (void) printf(gettext(" 16  stmf property support\n"));
+               (void) printf(gettext(" 17  Triple-parity RAID-Z\n"));
+               (void) printf(gettext(" 18  snapshot user holds\n"));
                (void) printf(gettext("For more information on a particular "
                    "version, including supported releases, see:\n\n"));
                (void) printf("http://www.opensolaris.org/os/community/zfs/"
@@ -3624,6 +3638,8 @@ char *hist_event_table[LOG_END] = {
        "refquota set",
        "refreservation set",
        "pool scrub done",
+       "user hold",
+       "user release",
 };
 
 /*
index 10007c1..6215191 100644 (file)
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -67,6 +67,7 @@
 #include <libdiskmgt.h>
 #include <libintl.h>
 #include <libnvpair.h>
+#include <limits.h>
 #include <stdio.h>
 #include <string.h>
 #include <unistd.h>
@@ -1093,20 +1094,35 @@ check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing,
 }
 
 static const char *
-is_grouping(const char *type, int *mindev)
+is_grouping(const char *type, int *mindev, int *maxdev)
 {
-       if (strcmp(type, "raidz") == 0 || strcmp(type, "raidz1") == 0) {
-               if (mindev != NULL)
-                       *mindev = 2;
-               return (VDEV_TYPE_RAIDZ);
-       }
+       if (strncmp(type, "raidz", 5) == 0) {
+               const char *p = type + 5;
+               char *end;
+               long nparity;
+
+               if (*p == '\0') {
+                       nparity = 1;
+               } else if (*p == '0') {
+                       return (NULL); /* no zero prefixes allowed */
+               } else {
+                       errno = 0;
+                       nparity = strtol(p, &end, 10);
+                       if (errno != 0 || nparity < 1 || nparity >= 255 ||
+                           *end != '\0')
+                               return (NULL);
+               }
 
-       if (strcmp(type, "raidz2") == 0) {
                if (mindev != NULL)
-                       *mindev = 3;
+                       *mindev = nparity + 1;
+               if (maxdev != NULL)
+                       *maxdev = 255;
                return (VDEV_TYPE_RAIDZ);
        }
 
+       if (maxdev != NULL)
+               *maxdev = INT_MAX;
+
        if (strcmp(type, "mirror") == 0) {
                if (mindev != NULL)
                        *mindev = 2;
@@ -1144,7 +1160,7 @@ nvlist_t *
 construct_spec(int argc, char **argv)
 {
        nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
-       int t, toplevels, mindev, nspares, nlogs, nl2cache;
+       int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
        const char *type;
        uint64_t is_log;
        boolean_t seen_logs;
@@ -1166,7 +1182,7 @@ construct_spec(int argc, char **argv)
                 * If it's a mirror or raidz, the subsequent arguments are
                 * its leaves -- until we encounter the next mirror or raidz.
                 */
-               if ((type = is_grouping(argv[0], &mindev)) != NULL) {
+               if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
                        nvlist_t **child = NULL;
                        int c, children = 0;
 
@@ -1223,7 +1239,7 @@ construct_spec(int argc, char **argv)
                        }
 
                        for (c = 1; c < argc; c++) {
-                               if (is_grouping(argv[c], NULL) != NULL)
+                               if (is_grouping(argv[c], NULL, NULL) != NULL)
                                        break;
                                children++;
                                child = realloc(child,
@@ -1243,6 +1259,13 @@ construct_spec(int argc, char **argv)
                                return (NULL);
                        }
 
+                       if (children > maxdev) {
+                               (void) fprintf(stderr, gettext("invalid vdev "
+                                   "specification: %s supports no more than "
+                                   "%d devices\n"), argv[0], maxdev);
+                               return (NULL);
+                       }
+
                        argc -= c;
                        argv += c;
 
index 746db0c..5f49fd5 100644 (file)
@@ -479,7 +479,7 @@ process_options(int argc, char **argv)
                        zopt_raidz = MAX(1, value);
                        break;
                case 'R':
-                       zopt_raidz_parity = MIN(MAX(value, 1), 2);
+                       zopt_raidz_parity = MIN(MAX(value, 1), 3);
                        break;
                case 'd':
                        zopt_datasets = MAX(1, value);
@@ -1387,7 +1387,7 @@ ztest_destroy_cb(char *name, void *arg)
        /*
         * Destroy the dataset.
         */
-       error = dmu_objset_destroy(name);
+       error = dmu_objset_destroy(name, B_FALSE);
        if (error) {
                (void) dmu_objset_open(name, DMU_OST_OTHER,
                    DS_MODE_USER | DS_MODE_READONLY, &os);
@@ -1560,7 +1560,7 @@ ztest_dmu_objset_create_destroy(ztest_args_t *za)
        zil_close(zilog);
        dmu_objset_close(os);
 
-       error = dmu_objset_destroy(name);
+       error = dmu_objset_destroy(name, B_FALSE);
        if (error)
                fatal(0, "dmu_objset_destroy(%s) = %d", name, error);
 
@@ -1583,7 +1583,7 @@ ztest_dmu_snapshot_create_destroy(ztest_args_t *za)
        (void) snprintf(snapname, 100, "%s@%llu", osname,
            (u_longlong_t)za->za_instance);
 
-       error = dmu_objset_destroy(snapname);
+       error = dmu_objset_destroy(snapname, B_FALSE);
        if (error != 0 && error != ENOENT)
                fatal(0, "dmu_objset_destroy() = %d", error);
        error = dmu_objset_snapshot(osname, strchr(snapname, '@')+1,
@@ -1614,19 +1614,19 @@ ztest_dsl_dataset_cleanup(char *osname, uint64_t curval)
        (void) snprintf(clone2name, 100, "%s/c2_%llu", osname, curval);
        (void) snprintf(snap3name, 100, "%s@s3_%llu", clone1name, curval);
 
-       error = dmu_objset_destroy(clone2name);
+       error = dmu_objset_destroy(clone2name, B_FALSE);
        if (error && error != ENOENT)
                fatal(0, "dmu_objset_destroy(%s) = %d", clone2name, error);
-       error = dmu_objset_destroy(snap3name);
+       error = dmu_objset_destroy(snap3name, B_FALSE);
        if (error && error != ENOENT)
                fatal(0, "dmu_objset_destroy(%s) = %d", snap3name, error);
-       error = dmu_objset_destroy(snap2name);
+       error = dmu_objset_destroy(snap2name, B_FALSE);
        if (error && error != ENOENT)
                fatal(0, "dmu_objset_destroy(%s) = %d", snap2name, error);
-       error = dmu_objset_destroy(clone1name);
+       error = dmu_objset_destroy(clone1name, B_FALSE);
        if (error && error != ENOENT)
                fatal(0, "dmu_objset_destroy(%s) = %d", clone1name, error);
-       error = dmu_objset_destroy(snap1name);
+       error = dmu_objset_destroy(snap1name, B_FALSE);
        if (error && error != ENOENT)
                fatal(0, "dmu_objset_destroy(%s) = %d", snap1name, error);
 }
index 8d00a31..f19e398 100644 (file)
@@ -117,6 +117,8 @@ enum {
        EZFS_NOTSUP,            /* ops not supported on this dataset */
        EZFS_ACTIVE_SPARE,      /* pool has active shared spare devices */
        EZFS_UNPLAYED_LOGS,     /* log device has unplayed logs */
+       EZFS_REFTAG_RELE,       /* snapshot release: tag not found */
+       EZFS_REFTAG_HOLD,       /* snapshot hold: tag already exists */
        EZFS_UNKNOWN
 };
 
@@ -286,6 +288,7 @@ typedef enum {
        ZPOOL_STATUS_VERSION_OLDER,     /* older on-disk version */
        ZPOOL_STATUS_RESILVERING,       /* device being resilvered */
        ZPOOL_STATUS_OFFLINE_DEV,       /* device online */
+       ZPOOL_STATUS_REMOVED_DEV,       /* removed device */
 
        /*
         * Finally, the following indicates a healthy pool.
@@ -454,8 +457,8 @@ extern int zfs_iter_snapshots(zfs_handle_t *, zfs_iter_f, void *);
 extern int zfs_create(libzfs_handle_t *, const char *, zfs_type_t,
     nvlist_t *);
 extern int zfs_create_ancestors(libzfs_handle_t *, const char *);
-extern int zfs_destroy(zfs_handle_t *);
-extern int zfs_destroy_snaps(zfs_handle_t *, char *);
+extern int zfs_destroy(zfs_handle_t *, boolean_t);
+extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t);
 extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *);
 extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *);
 extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t);
@@ -463,6 +466,8 @@ extern int zfs_rename(zfs_handle_t *, const char *, boolean_t);
 extern int zfs_send(zfs_handle_t *, const char *, const char *,
     boolean_t, boolean_t, boolean_t, boolean_t, int);
 extern int zfs_promote(zfs_handle_t *);
+extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t);
+extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t);
 
 typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain,
     uid_t rid, uint64_t space);
index 6fa1967..ff438b3 100644 (file)
@@ -508,6 +508,14 @@ change_one(zfs_handle_t *zhp, void *data)
                            &idx);
                        uu_list_insert(clp->cl_list, cn, idx);
                } else {
+                       /*
+                        * Add this child to beginning of the list. Children
+                        * below this one in the hierarchy will get added above
+                        * this one in the list. This produces a list in
+                        * reverse dataset name order.
+                        * This is necessary when the original mountpoint
+                        * is legacy or none.
+                        */
                        ASSERT(!clp->cl_alldependents);
                        verify(uu_list_insert_before(clp->cl_list,
                            uu_list_first(clp->cl_list), cn) == 0);
@@ -574,6 +582,7 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags,
        zfs_handle_t *temp;
        char property[ZFS_MAXPROPLEN];
        uu_compare_fn_t *compare = NULL;
+       boolean_t legacy = B_FALSE;
 
        if ((clp = zfs_alloc(zhp->zfs_hdl, sizeof (prop_changelist_t))) == NULL)
                return (NULL);
@@ -586,8 +595,19 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags,
        if (prop == ZFS_PROP_NAME || prop == ZFS_PROP_ZONED ||
            prop == ZFS_PROP_MOUNTPOINT || prop == ZFS_PROP_SHARENFS ||
            prop == ZFS_PROP_SHARESMB) {
-               compare = compare_mountpoints;
-               clp->cl_sorted = B_TRUE;
+
+               if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT,
+                   property, sizeof (property),
+                   NULL, NULL, 0, B_FALSE) == 0 &&
+                   (strcmp(property, "legacy") == 0 ||
+                   strcmp(property, "none") == 0)) {
+
+                       legacy = B_TRUE;
+               }
+               if (!legacy) {
+                       compare = compare_mountpoints;
+                       clp->cl_sorted = B_TRUE;
+               }
        }
 
        clp->cl_pool = uu_list_pool_create("changelist_pool",
@@ -695,6 +715,12 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags,
                (void) uu_list_find(clp->cl_list, cn, NULL, &idx);
                uu_list_insert(clp->cl_list, cn, idx);
        } else {
+               /*
+                * Add the target dataset to the end of the list.
+                * The list is not really unsorted. The list will be
+                * in reverse dataset name order. This is necessary
+                * when the original mountpoint is legacy or none.
+                */
                verify(uu_list_insert_after(clp->cl_list,
                    uu_list_last(clp->cl_list), cn) == 0);
        }
@@ -703,11 +729,7 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags,
         * If the mountpoint property was previously 'legacy', or 'none',
         * record it as the behavior of changelist_postfix() will be different.
         */
-       if ((clp->cl_prop == ZFS_PROP_MOUNTPOINT) &&
-           (zfs_prop_get(zhp, prop, property, sizeof (property),
-           NULL, NULL, 0, B_FALSE) == 0 &&
-           (strcmp(property, "legacy") == 0 ||
-           strcmp(property, "none") == 0))) {
+       if ((clp->cl_prop == ZFS_PROP_MOUNTPOINT) && legacy) {
                /*
                 * do not automatically mount ex-legacy datasets if
                 * we specifically set canmount to noauto
index ac91226..ab9ba6b 100644 (file)
@@ -47,6 +47,7 @@
 #include <ucred.h>
 #include <idmap.h>
 #include <aclutils.h>
+#include <directory.h>
 
 #include <sys/spa.h>
 #include <sys/zap.h>
@@ -1674,21 +1675,13 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src,
                (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
                if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_ZPLPROPS, &zc)) {
                        zcmd_free_nvlists(&zc);
-                       zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
-                           "unable to get %s property"),
-                           zfs_prop_to_name(prop));
-                       return (zfs_error(zhp->zfs_hdl, EZFS_BADVERSION,
-                           dgettext(TEXT_DOMAIN, "internal error")));
+                       return (-1);
                }
                if (zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &zplprops) != 0 ||
                    nvlist_lookup_uint64(zplprops, zfs_prop_to_name(prop),
                    val) != 0) {
                        zcmd_free_nvlists(&zc);
-                       zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
-                           "unable to get %s property"),
-                           zfs_prop_to_name(prop));
-                       return (zfs_error(zhp->zfs_hdl, EZFS_NOMEM,
-                           dgettext(TEXT_DOMAIN, "internal error")));
+                       return (-1);
                }
                if (zplprops)
                        nvlist_free(zplprops);
@@ -2074,6 +2067,7 @@ userquota_propname_decode(const char *propname, boolean_t zoned,
 {
        zfs_userquota_prop_t type;
        char *cp, *end;
+       char *numericsid = NULL;
        boolean_t isuser;
 
        domain[0] = '\0';
@@ -2096,33 +2090,41 @@ userquota_propname_decode(const char *propname, boolean_t zoned,
        if (strchr(cp, '@')) {
                /*
                 * It's a SID name (eg "user@domain") that needs to be
-                * turned into S-1-domainID-RID.  There should be a
-                * better way to do this, but for now just translate it
-                * to the (possibly ephemeral) uid and then back to the
-                * SID.  This is like getsidname(noresolve=TRUE).
+                * turned into S-1-domainID-RID.
                 */
-               uid_t id;
-               idmap_rid_t rid;
-               char *mapdomain;
-
+               directory_error_t e;
                if (zoned && getzoneid() == GLOBAL_ZONEID)
                        return (ENOENT);
-               if (sid_to_id(cp, isuser, &id) != 0)
+               if (isuser) {
+                       e = directory_sid_from_user_name(NULL,
+                           cp, &numericsid);
+               } else {
+                       e = directory_sid_from_group_name(NULL,
+                           cp, &numericsid);
+               }
+               if (e != NULL) {
+                       directory_error_free(e);
                        return (ENOENT);
-               if (idmap_id_to_numeric_domain_rid(id, isuser,
-                   &mapdomain, &rid) != 0)
+               }
+               if (numericsid == NULL)
                        return (ENOENT);
-               (void) strlcpy(domain, mapdomain, domainlen);
-               *ridp = rid;
-       } else if (strncmp(cp, "S-1-", 4) == 0) {
+               cp = numericsid;
+               /* will be further decoded below */
+       }
+
+       if (strncmp(cp, "S-1-", 4) == 0) {
                /* It's a numeric SID (eg "S-1-234-567-89") */
-               (void) strcpy(domain, cp);
+               (void) strlcpy(domain, cp, domainlen);
                cp = strrchr(domain, '-');
                *cp = '\0';
                cp++;
 
                errno = 0;
                *ridp = strtoull(cp, &end, 10);
+               if (numericsid) {
+                       free(numericsid);
+                       numericsid = NULL;
+               }
                if (errno != 0 || *end != '\0')
                        return (EINVAL);
        } else if (!isdigit(*cp)) {
@@ -2158,13 +2160,14 @@ userquota_propname_decode(const char *propname, boolean_t zoned,
                        if (idmap_id_to_numeric_domain_rid(id, isuser,
                            &mapdomain, &rid) != 0)
                                return (ENOENT);
-                       (void) strcpy(domain, mapdomain);
+                       (void) strlcpy(domain, mapdomain, domainlen);
                        *ridp = rid;
                } else {
                        *ridp = id;
                }
        }
 
+       ASSERT3P(numericsid, ==, NULL);
        return (0);
 }
 
@@ -2763,7 +2766,7 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
  * isn't mounted, and that there are no active dependents.
  */
 int
-zfs_destroy(zfs_handle_t *zhp)
+zfs_destroy(zfs_handle_t *zhp, boolean_t defer)
 {
        zfs_cmd_t zc = { 0 };
 
@@ -2787,6 +2790,7 @@ zfs_destroy(zfs_handle_t *zhp)
                zc.zc_objset_type = DMU_OST_ZFS;
        }
 
+       zc.zc_defer_destroy = defer;
        if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY, &zc) != 0) {
                return (zfs_standard_error_fmt(zhp->zfs_hdl, errno,
                    dgettext(TEXT_DOMAIN, "cannot destroy '%s'"),
@@ -2843,7 +2847,7 @@ zfs_remove_link_cb(zfs_handle_t *zhp, void *arg)
  * Destroys all snapshots with the given name in zhp & descendants.
  */
 int
-zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname)
+zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer)
 {
        zfs_cmd_t zc = { 0 };
        int ret;
@@ -2860,6 +2864,7 @@ zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname)
 
        (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
        (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
+       zc.zc_defer_destroy = defer;
 
        ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY_SNAPS, &zc);
        if (ret != 0) {
@@ -3275,7 +3280,7 @@ rollback_destroy(zfs_handle_t *zhp, void *data)
 
                        logstr = zhp->zfs_hdl->libzfs_log_str;
                        zhp->zfs_hdl->libzfs_log_str = NULL;
-                       cbp->cb_error |= zfs_destroy(zhp);
+                       cbp->cb_error |= zfs_destroy(zhp, B_FALSE);
                        zhp->zfs_hdl->libzfs_log_str = logstr;
                }
        } else {
@@ -3289,7 +3294,7 @@ rollback_destroy(zfs_handle_t *zhp, void *data)
                        zfs_close(zhp);
                        return (0);
                }
-               if (zfs_destroy(zhp) != 0)
+               if (zfs_destroy(zhp, B_FALSE) != 0)
                        cbp->cb_error = B_TRUE;
                else
                        changelist_remove(clp, zhp->zfs_name);
@@ -4089,3 +4094,79 @@ zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type,
 
        return (error);
 }
+
+int
+zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag,
+    boolean_t recursive)
+{
+       zfs_cmd_t zc = { 0 };
+       libzfs_handle_t *hdl = zhp->zfs_hdl;
+
+       (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+       (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
+       (void) strlcpy(zc.zc_string, tag, sizeof (zc.zc_string));
+       zc.zc_cookie = recursive;
+
+       if (zfs_ioctl(hdl, ZFS_IOC_HOLD, &zc) != 0) {
+               char errbuf[ZFS_MAXNAMELEN+32];
+
+               /*
+                * if it was recursive, the one that actually failed will be in
+                * zc.zc_name.
+                */
+               (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+                   "cannot hold '%s@%s'"), zc.zc_name, snapname);
+               switch (errno) {
+               case ENOTSUP:
+                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                           "pool must be upgraded"));
+                       return (zfs_error(hdl, EZFS_BADVERSION, errbuf));
+               case EINVAL:
+                       return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
+               case EEXIST:
+                       return (zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf));
+               default:
+                       return (zfs_standard_error_fmt(hdl, errno, errbuf));
+               }
+       }
+
+       return (0);
+}
+
+int
+zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag,
+    boolean_t recursive)
+{
+       zfs_cmd_t zc = { 0 };
+       libzfs_handle_t *hdl = zhp->zfs_hdl;
+
+       (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+       (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
+       (void) strlcpy(zc.zc_string, tag, sizeof (zc.zc_string));
+       zc.zc_cookie = recursive;
+
+       if (zfs_ioctl(hdl, ZFS_IOC_RELEASE, &zc) != 0) {
+               char errbuf[ZFS_MAXNAMELEN+32];
+
+               /*
+                * if it was recursive, the one that actually failed will be in
+                * zc.zc_name.
+                */
+               (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+                   "cannot release '%s@%s'"), zc.zc_name, snapname);
+               switch (errno) {
+               case ESRCH:
+                       return (zfs_error(hdl, EZFS_REFTAG_RELE, errbuf));
+               case ENOTSUP:
+                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                           "pool must be upgraded"));
+                       return (zfs_error(hdl, EZFS_BADVERSION, errbuf));
+               case EINVAL:
+                       return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
+               default:
+                       return (zfs_standard_error_fmt(hdl, errno, errbuf));
+               }
+       }
+
+       return (0);
+}
index 612a099..1ffb629 100644 (file)
@@ -113,6 +113,9 @@ fsavl_destroy(avl_tree_t *avl)
        free(avl);
 }
 
+/*
+ * Given an nvlist, produce an avl tree of snapshots, ordered by guid
+ */
 static avl_tree_t *
 fsavl_create(nvlist_t *fss)
 {
@@ -243,7 +246,9 @@ send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv)
                        continue;
 
                verify(nvpair_value_nvlist(elem, &propnv) == 0);
-               if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION) {
+               if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION ||
+                   prop == ZFS_PROP_REFQUOTA ||
+                   prop == ZFS_PROP_REFRESERVATION) {
                        /* these guys are modifyable, but have no source */
                        uint64_t value;
                        verify(nvlist_lookup_uint64(propnv,
@@ -274,6 +279,11 @@ send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv)
        }
 }
 
+/*
+ * recursively generate nvlists describing datasets.  See comment
+ * for the data structure send_data_t above for description of contents
+ * of the nvlist.
+ */
 static int
 send_iterate_fs(zfs_handle_t *zhp, void *arg)
 {
@@ -689,9 +699,20 @@ again:
 }
 
 /*
- * Dumps a backup of tosnap, incremental from fromsnap if it isn't NULL.
- * If 'doall', dump all intermediate snaps.
- * If 'replicate', dump special header and do recursively.
+ * Generate a send stream for the dataset identified by the argument zhp.
+ *
+ * The content of the send stream is the snapshot identified by
+ * 'tosnap'.  Incremental streams are requested in two ways:
+ *     - from the snapshot identified by "fromsnap" (if non-null) or
+ *     - from the origin of the dataset identified by zhp, which must
+ *      be a clone.  In this case, "fromsnap" is null and "fromorigin"
+ *      is TRUE.
+ *
+ * The send stream is recursive (i.e. dumps a hierarchy of snapshots) and
+ * uses a special header (with a version field of DMU_BACKUP_HEADER_VERSION)
+ * if "replicate" is set.  If "doall" is set, dump all the intermediate
+ * snapshots. The DMU_BACKUP_HEADER_VERSION header is used in the "doall"
+ * case too.
  */
 int
 zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
@@ -900,11 +921,12 @@ recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname,
        if (err)
                return (err);
 
+       zc.zc_objset_type = DMU_OST_ZFS;
+       (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
+
        if (tryname) {
                (void) strcpy(newname, tryname);
 
-               zc.zc_objset_type = DMU_OST_ZFS;
-               (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
                (void) strlcpy(zc.zc_value, tryname, sizeof (zc.zc_value));
 
                if (flags.verbose) {
@@ -959,12 +981,18 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen,
        int err = 0;
        prop_changelist_t *clp;
        zfs_handle_t *zhp;
+       boolean_t defer = B_FALSE;
+       int spa_version;
 
        zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET);
        if (zhp == NULL)
                return (-1);
        clp = changelist_gather(zhp, ZFS_PROP_NAME, 0,
            flags.force ? MS_FORCE : 0);
+       if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT &&
+           zfs_spa_version(zhp, &spa_version) == 0 &&
+           spa_version >= SPA_VERSION_USERREFS)
+               defer = B_TRUE;
        zfs_close(zhp);
        if (clp == NULL)
                return (-1);
@@ -973,12 +1001,12 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen,
                return (err);
 
        zc.zc_objset_type = DMU_OST_ZFS;
+       zc.zc_defer_destroy = defer;
        (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
 
        if (flags.verbose)
                (void) printf("attempting destroy %s\n", zc.zc_name);
        err = ioctl(hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc);
-
        if (err == 0) {
                if (flags.verbose)
                        (void) printf("success\n");
@@ -988,7 +1016,12 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen,
        (void) changelist_postfix(clp);
        changelist_free(clp);
 
-       if (err != 0)
+       /*
+        * Deferred destroy should always succeed. Since we can't tell
+        * if it destroyed the dataset or just marked it for deferred
+        * destroy, always do the rename just in case.
+        */
+       if (err != 0 || defer)
                err = recv_rename(hdl, name, NULL, baselen, newname, flags);
 
        return (err);
@@ -1775,11 +1808,13 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                        /* We can't do online recv in this case */
                        clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0);
                        if (clp == NULL) {
+                               zfs_close(zhp);
                                zcmd_free_nvlists(&zc);
                                return (-1);
                        }
                        if (changelist_prefix(clp) != 0) {
                                changelist_free(clp);
+                               zfs_close(zhp);
                                zcmd_free_nvlists(&zc);
                                return (-1);
                        }
@@ -1936,7 +1971,8 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
         * (if created, or if we tore them down to do an incremental
         * restore), and the /dev links for the new snapshot (if
         * created). Also mount any children of the target filesystem
-        * if we did an incremental receive.
+        * if we did a replication receive (indicated by stream_avl
+        * being non-NULL).
         */
        cp = strchr(zc.zc_value, '@');
        if (cp && (ioctl_err == 0 || !newfs)) {
@@ -1952,7 +1988,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                                if (err == 0 && ioctl_err == 0)
                                        err = zvol_create_link(hdl,
                                            zc.zc_value);
-                       } else if (newfs) {
+                       } else if (newfs || stream_avl) {
                                /*
                                 * Track the first/top of hierarchy fs,
                                 * for mounting and sharing later.
index c7eb04e..44faf02 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -104,6 +104,13 @@ vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs)
        return (state == VDEV_STATE_OFFLINE);
 }
 
+/* ARGSUSED */
+static int
+vdev_removed(uint64_t state, uint64_t aux, uint64_t errs)
+{
+       return (state == VDEV_STATE_REMOVED);
+}
+
 /*
  * Detect if any leaf devices that have seen errors or could not be opened.
  */
@@ -276,6 +283,12 @@ check_status(nvlist_t *config, boolean_t isimport)
                return (ZPOOL_STATUS_OFFLINE_DEV);
 
        /*
+        * Removed device
+        */
+       if (find_vdev_problem(nvroot, vdev_removed))
+               return (ZPOOL_STATUS_REMOVED_DEV);
+
+       /*
         * Currently resilvering
         */
        if (!vs->vs_scrub_complete && vs->vs_scrub_type == POOL_SCRUB_RESILVER)
index 30829d5..4da0fb4 100644 (file)
@@ -213,6 +213,11 @@ libzfs_error_description(libzfs_handle_t *hdl)
        case EZFS_UNPLAYED_LOGS:
                return (dgettext(TEXT_DOMAIN, "log device has unplayed intent "
                    "logs"));
+       case EZFS_REFTAG_RELE:
+               return (dgettext(TEXT_DOMAIN, "no such tag on this dataset"));
+       case EZFS_REFTAG_HOLD:
+               return (dgettext(TEXT_DOMAIN, "tag already exists on this "
+                   "dataset"));
        case EZFS_UNKNOWN:
                return (dgettext(TEXT_DOMAIN, "unknown error"));
        default:
index 6651b14..86b36a8 100644 (file)
@@ -18,6 +18,7 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
@@ -26,6 +27,8 @@
 #ifndef        _SYS_FS_ZFS_H
 #define        _SYS_FS_ZFS_H
 
+#include <sys/time.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -111,6 +114,8 @@ typedef enum {
        ZFS_PROP_USEDREFRESERV,
        ZFS_PROP_USERACCOUNTING,        /* not exposed to the user */
        ZFS_PROP_STMF_SHAREINFO,        /* not exposed to the user */
+       ZFS_PROP_DEFER_DESTROY,
+       ZFS_PROP_USERREFS,
        ZFS_NUM_PROPS
 } zfs_prop_t;
 
@@ -280,14 +285,16 @@ typedef enum zfs_cache_type {
 #define        SPA_VERSION_14                  14ULL
 #define        SPA_VERSION_15                  15ULL
 #define        SPA_VERSION_16                  16ULL
+#define        SPA_VERSION_17                  17ULL
+#define        SPA_VERSION_18                  18ULL
 /*
  * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
  * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
  * and do the appropriate changes.  Also bump the version number in
  * usr/src/grub/capability.
  */
-#define        SPA_VERSION                     SPA_VERSION_16
-#define        SPA_VERSION_STRING              "16"
+#define        SPA_VERSION                     SPA_VERSION_18
+#define        SPA_VERSION_STRING              "18"
 
 /*
  * Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -303,7 +310,7 @@ typedef enum zfs_cache_type {
 #define        SPA_VERSION_INITIAL             SPA_VERSION_1
 #define        SPA_VERSION_DITTO_BLOCKS        SPA_VERSION_2
 #define        SPA_VERSION_SPARES              SPA_VERSION_3
-#define        SPA_VERSION_RAID6              SPA_VERSION_3
+#define        SPA_VERSION_RAIDZ2              SPA_VERSION_3
 #define        SPA_VERSION_BPLIST_ACCOUNT      SPA_VERSION_3
 #define        SPA_VERSION_RAIDZ_DEFLATE       SPA_VERSION_3
 #define        SPA_VERSION_DNODE_BYTES         SPA_VERSION_3
@@ -325,6 +332,8 @@ typedef enum zfs_cache_type {
 #define        SPA_VERSION_PASSTHROUGH_X       SPA_VERSION_14
 #define        SPA_VERSION_USERSPACE           SPA_VERSION_15
 #define        SPA_VERSION_STMF_PROP           SPA_VERSION_16
+#define        SPA_VERSION_RAIDZ3              SPA_VERSION_17
+#define        SPA_VERSION_USERREFS            SPA_VERSION_18
 
 /*
  * ZPL version - rev'd whenever an incompatible on-disk format change
@@ -601,7 +610,10 @@ typedef enum zfs_ioc {
        ZFS_IOC_SMB_ACL,
        ZFS_IOC_USERSPACE_ONE,
        ZFS_IOC_USERSPACE_MANY,
-       ZFS_IOC_USERSPACE_UPGRADE
+       ZFS_IOC_USERSPACE_UPGRADE,
+       ZFS_IOC_HOLD,
+       ZFS_IOC_RELEASE,
+       ZFS_IOC_GET_HOLDS
 } zfs_ioc_t;
 
 /*
@@ -715,6 +727,8 @@ typedef enum history_internal_events {
        LOG_DS_REFQUOTA,
        LOG_DS_REFRESERV,
        LOG_POOL_SCRUB_DONE,
+       LOG_DS_USER_HOLD,
+       LOG_DS_USER_RELEASE,
        LOG_END
 } history_internal_events_t;
 
index cdbbd83..e90cd0d 100644 (file)
@@ -61,6 +61,8 @@ typedef enum {
        ZFS_DELEG_NOTE_GROUPQUOTA,
        ZFS_DELEG_NOTE_USERUSED,
        ZFS_DELEG_NOTE_GROUPUSED,
+       ZFS_DELEG_NOTE_HOLD,
+       ZFS_DELEG_NOTE_RELEASE,
        ZFS_DELEG_NOTE_NONE
 } zfs_deleg_note_t;
 
index 2964cae..35f81b5 100644 (file)
@@ -67,6 +67,8 @@ zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = {
        {ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA },
        {ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED },
        {ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED },
+       {ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD },
+       {ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE },
        {NULL, ZFS_DELEG_NOTE_NONE }
 };
 
index 45730c6..5cfafea 100644 (file)
@@ -59,7 +59,7 @@ valid_char(char c)
  * Snapshot names must be made up of alphanumeric characters plus the following
  * characters:
  *
- *     [-_.:]
+ *     [-_.: ]
  */
 int
 snapshot_namecheck(const char *path, namecheck_err_t *why, char *what)
index 05d8306..6a32846 100644 (file)
@@ -235,6 +235,9 @@ zfs_prop_init(void)
        /* readonly index (boolean) properties */
        register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY,
            ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table);
+       register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0,
+           PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY",
+           boolean_table);
 
        /* set once index properties */
        register_index(ZFS_PROP_NORMALIZE, "normalization", 0,
@@ -286,6 +289,8 @@ zfs_prop_init(void)
        register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0,
            PROP_READONLY,
            ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDREFRESERV");
+       register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY,
+           ZFS_TYPE_SNAPSHOT, "<count>", "USERREFS");
 
        /* default number properties */
        register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT,
index 785c7c6..d864682 100644 (file)
@@ -87,6 +87,7 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
        {       zap_byteswap,           TRUE,   "scrub work queue"      },
        {       zap_byteswap,           TRUE,   "ZFS user/group used"   },
        {       zap_byteswap,           TRUE,   "ZFS user/group quota"  },
+       {       zap_byteswap,           TRUE,   "snapshot refcount tags"},
 };
 
 int
@@ -195,7 +196,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 
        ASSERT(length <= DMU_MAX_ACCESS);
 
-       dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
+       dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
        if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
                dbuf_flags |= DB_RF_NOPREFETCH;
 
@@ -212,6 +213,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
                            os_dsl_dataset->ds_object,
                            (longlong_t)dn->dn_object, dn->dn_datablksz,
                            (longlong_t)offset, (longlong_t)length);
+                       rw_exit(&dn->dn_struct_rwlock);
                        return (EIO);
                }
                nblks = 1;
@@ -234,9 +236,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
                }
                /* initiate async i/o */
                if (read) {
-                       rw_exit(&dn->dn_struct_rwlock);
                        (void) dbuf_read(db, zio, dbuf_flags);
-                       rw_enter(&dn->dn_struct_rwlock, RW_READER);
                }
                dbp[i] = &db->db;
        }
@@ -376,56 +376,51 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
        dnode_rele(dn, FTAG);
 }
 
+/*
+ * Get the next "chunk" of file data to free.  We traverse the file from
+ * the end so that the file gets shorter over time (if we crash in the
+ * middle, this will leave us in a better state).  We find allocated file
+ * data by simply searching the allocated level 1 indirects.
+ */
 static int
-get_next_chunk(dnode_t *dn, uint64_t *offset, uint64_t limit)
+get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit)
 {
-       uint64_t len = *offset - limit;
-       uint64_t chunk_len = dn->dn_datablksz * DMU_MAX_DELETEBLKCNT;
-       uint64_t subchunk =
+       uint64_t len = *start - limit;
+       uint64_t blkcnt = 0;
+       uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1));
+       uint64_t iblkrange =
            dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
 
-       ASSERT(limit <= *offset);
+       ASSERT(limit <= *start);
 
-       if (len <= chunk_len) {
-               *offset = limit;
+       if (len <= iblkrange * maxblks) {
+               *start = limit;
                return (0);
        }
+       ASSERT(ISP2(iblkrange));
 
-       ASSERT(ISP2(subchunk));
-
-       while (*offset > limit) {
-               uint64_t initial_offset = P2ROUNDUP(*offset, subchunk);
-               uint64_t delta;
+       while (*start > limit && blkcnt < maxblks) {
                int err;
 
-               /* skip over allocated data */
+               /* find next allocated L1 indirect */
                err = dnode_next_offset(dn,
-                   DNODE_FIND_HOLE|DNODE_FIND_BACKWARDS, offset, 1, 1, 0);
-               if (err == ESRCH)
-                       *offset = limit;
-               else if (err)
-                       return (err);
+                   DNODE_FIND_BACKWARDS, start, 2, 1, 0);
 
-               ASSERT3U(*offset, <=, initial_offset);
-               *offset = P2ALIGN(*offset, subchunk);
-               delta = initial_offset - *offset;
-               if (delta >= chunk_len) {
-                       *offset += delta - chunk_len;
+               /* if there are no more, then we are done */
+               if (err == ESRCH) {
+                       *start = limit;
                        return (0);
-               }
-               chunk_len -= delta;
-
-               /* skip over unallocated data */
-               err = dnode_next_offset(dn,
-                   DNODE_FIND_BACKWARDS, offset, 1, 1, 0);
-               if (err == ESRCH)
-                       *offset = limit;
-               else if (err)
+               } else if (err) {
                        return (err);
+               }
+               blkcnt += 1;
 
-               if (*offset < limit)
-                       *offset = limit;
-               ASSERT3U(*offset, <, initial_offset);
+               /* reset offset to end of "next" block back */
+               *start = P2ALIGN(*start, iblkrange);
+               if (*start <= limit)
+                       *start = limit;
+               else
+                       *start -= 1;
        }
        return (0);
 }
@@ -548,7 +543,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 {
        dnode_t *dn;
        dmu_buf_t **dbp;
-       int numbufs, i, err;
+       int numbufs, err;
 
        err = dnode_hold(os->os, object, FTAG, &dn);
        if (err)
@@ -559,7 +554,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
         * block.  If we ever do the tail block optimization, we will need to
         * handle that here as well.
         */
-       if (dn->dn_datablkshift == 0) {
+       if (dn->dn_maxblkid == 0) {
                int newsz = offset > dn->dn_datablksz ? 0 :
                    MIN(size, dn->dn_datablksz - offset);
                bzero((char *)buf + newsz, size - newsz);
@@ -568,6 +563,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 
        while (size > 0) {
                uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
+               int i;
 
                /*
                 * NB: we could do this block-at-a-time, but it's nice
@@ -803,9 +799,6 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
                if (tocpy == db->db_size)
                        dmu_buf_fill_done(db, tx);
 
-               if (err)
-                       break;
-
                offset += tocpy;
                size -= tocpy;
        }
index e962c4b..5a9d25b 100644 (file)
@@ -679,7 +679,7 @@ dmu_objset_create(const char *name, dmu_objset_type_t type,
 }
 
 int
-dmu_objset_destroy(const char *name)
+dmu_objset_destroy(const char *name, boolean_t defer)
 {
        objset_t *os;
        int error;
@@ -696,7 +696,7 @@ dmu_objset_destroy(const char *name)
                dsl_dataset_t *ds = os->os->os_dsl_dataset;
                zil_destroy(dmu_objset_zil(os), B_FALSE);
 
-               error = dsl_dataset_destroy(ds, os);
+               error = dsl_dataset_destroy(ds, os, defer);
                /*
                 * dsl_dataset_destroy() closes the ds.
                 */
@@ -1130,7 +1130,7 @@ dmu_objset_userspace_upgrade(objset_t *os)
         */
 
        for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
-               dmu_tx_t *tx = dmu_tx_create(os);
+               dmu_tx_t *tx;
                dmu_buf_t *db;
                int objerr;
 
@@ -1140,6 +1140,7 @@ dmu_objset_userspace_upgrade(objset_t *os)
                objerr = dmu_bonus_hold(os, obj, FTAG, &db);
                if (objerr)
                        continue;
+               tx = dmu_tx_create(os);
                dmu_tx_hold_bonus(tx, obj);
                objerr = dmu_tx_assign(tx, TXG_WAIT);
                if (objerr) {
index 9ca3999..ce59aac 100644 (file)
@@ -393,6 +393,7 @@ recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
        dsl_dataset_t *ds = arg1;
        struct recvbeginsyncarg *rbsa = arg2;
        int err;
+       struct dsl_ds_destroyarg dsda = {0};
 
        /* must be a head ds */
        if (ds->ds_phys->ds_next_snap_obj != 0)
@@ -402,7 +403,8 @@ recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
        if (dsl_dir_is_clone(ds->ds_dir))
                return (EINVAL);
 
-       err = dsl_dataset_destroy_check(ds, rbsa->tag, tx);
+       dsda.ds = ds;
+       err = dsl_dataset_destroy_check(&dsda, rbsa->tag, tx);
        if (err)
                return (err);
 
@@ -427,13 +429,16 @@ recv_full_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
        dsl_dir_t *dd = ds->ds_dir;
        uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
        uint64_t dsobj;
+       struct dsl_ds_destroyarg dsda = {0};
 
        /*
         * NB: caller must provide an extra hold on the dsl_dir_t, so it
         * won't go away when dsl_dataset_destroy_sync() closes the
         * dataset.
         */
-       dsl_dataset_destroy_sync(ds, rbsa->tag, cr, tx);
+       dsda.ds = ds;
+       dsl_dataset_destroy_sync(&dsda, rbsa->tag, cr, tx);
+       ASSERT3P(dsda.rm_origin, ==, NULL);
 
        dsobj = dsl_dataset_create_sync_dd(dd, rbsa->origin, flags, tx);
 
@@ -483,7 +488,7 @@ recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
 
 /* ARGSUSED */
 static void
-recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+recv_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 {
        dsl_dataset_t *ohds = arg1;
        struct recvbeginsyncarg *rbsa = arg2;
@@ -513,27 +518,13 @@ recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
            dp->dp_spa, tx, cr, "dataset = %lld", dsobj);
 }
 
-/* ARGSUSED */
-static void
-recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
-       dsl_dataset_t *ds = arg1;
-
-       dmu_buf_will_dirty(ds->ds_dbuf, tx);
-       ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
-
-       spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
-           ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld",
-           ds->ds_object);
-}
-
 /*
  * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
  * succeeds; otherwise we will leak the holds on the datasets.
  */
 int
 dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
-    boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *drc)
+    boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc)
 {
        int err = 0;
        boolean_t byteswap;
@@ -582,36 +573,8 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
        /*
         * Process the begin in syncing context.
         */
-       if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE) && !online) {
-               /* offline incremental receive */
-               err = dsl_dataset_own(tofs, 0, dmu_recv_tag, &ds);
-               if (err)
-                       return (err);
-
-               /*
-                * Only do the rollback if the most recent snapshot
-                * matches the incremental source
-                */
-               if (force) {
-                       if (ds->ds_prev == NULL ||
-                           ds->ds_prev->ds_phys->ds_guid !=
-                           rbsa.fromguid) {
-                               dsl_dataset_disown(ds, dmu_recv_tag);
-                               return (ENODEV);
-                       }
-                       (void) dsl_dataset_rollback(ds, DMU_OST_NONE);
-               }
-               rbsa.force = B_FALSE;
-               err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-                   recv_incremental_check,
-                   recv_offline_incremental_sync, ds, &rbsa, 1);
-               if (err) {
-                       dsl_dataset_disown(ds, dmu_recv_tag);
-                       return (err);
-               }
-               drc->drc_logical_ds = drc->drc_real_ds = ds;
-       } else if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) {
-               /* online incremental receive */
+       if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) {
+               /* incremental receive */
 
                /* tmp clone name is: tofs/%tosnap" */
                (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
@@ -622,11 +585,18 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
                if (err)
                        return (err);
 
+               /* must not have an incremental recv already in progress */
+               if (!mutex_tryenter(&ds->ds_recvlock)) {
+                       dsl_dataset_rele(ds, dmu_recv_tag);
+                       return (EBUSY);
+               }
+
                rbsa.force = force;
                err = dsl_sync_task_do(ds->ds_dir->dd_pool,
                    recv_incremental_check,
-                   recv_online_incremental_sync, ds, &rbsa, 5);
+                   recv_incremental_sync, ds, &rbsa, 5);
                if (err) {
+                       mutex_exit(&ds->ds_recvlock);
                        dsl_dataset_rele(ds, dmu_recv_tag);
                        return (err);
                }
@@ -931,26 +901,6 @@ restore_free(struct restorearg *ra, objset_t *os,
        return (err);
 }
 
-void
-dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc)
-{
-       if (drc->drc_newfs || drc->drc_real_ds != drc->drc_logical_ds) {
-               /*
-                * online incremental or new fs: destroy the fs (which
-                * may be a clone) that we created
-                */
-               (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag);
-               if (drc->drc_real_ds != drc->drc_logical_ds)
-                       dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag);
-       } else {
-               /*
-                * offline incremental: rollback to most recent snapshot.
-                */
-               (void) dsl_dataset_rollback(drc->drc_real_ds, DMU_OST_NONE);
-               dsl_dataset_disown(drc->drc_real_ds, dmu_recv_tag);
-       }
-}
-
 /*
  * NB: callers *must* call dmu_recv_end() if this succeeds.
  */
@@ -1078,11 +1028,17 @@ out:
 
        if (ra.err != 0) {
                /*
-                * rollback or destroy what we created, so we don't
-                * leave it in the restoring state.
+                * destroy what we created, so we don't leave it in the
+                * inconsistent restoring state.
                 */
                txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0);
-               dmu_recv_abort_cleanup(drc);
+
+               (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
+                   B_FALSE);
+               if (drc->drc_real_ds != drc->drc_logical_ds) {
+                       mutex_exit(&drc->drc_logical_ds->ds_recvlock);
+                       dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag);
+               }
        }
 
        kmem_free(ra.buf, ra.bufsize);
@@ -1149,7 +1105,9 @@ dmu_recv_end(dmu_recv_cookie_t *drc)
                        dsl_dataset_rele(ds, dmu_recv_tag);
                }
                /* dsl_dataset_destroy() will disown the ds */
-               (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag);
+               (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
+                   B_FALSE);
+               mutex_exit(&drc->drc_logical_ds->ds_recvlock);
                if (err)
                        return (err);
        }
@@ -1163,7 +1121,8 @@ dmu_recv_end(dmu_recv_cookie_t *drc)
        if (err) {
                if (drc->drc_newfs) {
                        ASSERT(ds == drc->drc_real_ds);
-                       (void) dsl_dataset_destroy(ds, dmu_recv_tag);
+                       (void) dsl_dataset_destroy(ds, dmu_recv_tag,
+                           B_FALSE);
                        return (err);
                } else {
                        (void) dsl_dataset_rollback(ds, DMU_OST_NONE);
index af2b049..c6fbeee 100644 (file)
@@ -697,8 +697,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
        }
 
        err = zap_count_write(&dn->dn_objset->os, dn->dn_object, name, add,
-           &txh->txh_space_towrite, &txh->txh_space_tooverwrite,
-           txh->txh_dnode->dn_datablkshift);
+           &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
 
        /*
         * If the modified blocks are scattered to the four winds,
index cf49b97..d82e72a 100644 (file)
@@ -1260,6 +1260,22 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
        dmu_tx_willuse_space(tx, space);
 }
 
+/*
+ * This function scans a block at the indicated "level" looking for
+ * a hole or data (depending on 'flags').  If level > 0, then we are
+ * scanning an indirect block looking at its pointers.  If level == 0,
+ * then we are looking at a block of dnodes.  If we don't find what we
+ * are looking for in the block, we return ESRCH.  Otherwise, return
+ * with *offset pointing to the beginning (if searching forwards) or
+ * end (if searching backwards) of the range covered by the block
+ * pointer we matched on (or dnode).
+ *
+ * The basic search algorithm used below by dnode_next_offset() is to
+ * use this function to search up the block tree (widen the search) until
+ * we find something (i.e., we don't return ESRCH) and then search back
+ * down the tree (narrow the search) until we reach our original search
+ * level.
+ */
 static int
 dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
        int lvl, uint64_t blkfill, uint64_t txg)
@@ -1330,6 +1346,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
                        error = ESRCH;
        } else {
                blkptr_t *bp = data;
+               uint64_t start = *offset;
                span = (lvl - 1) * epbs + dn->dn_datablkshift;
                minfill = 0;
                maxfill = blkfill << ((lvl - 1) * epbs);
@@ -1339,18 +1356,25 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
                else
                        minfill++;
 
-               for (i = (*offset >> span) & ((1ULL << epbs) - 1);
+               *offset = *offset >> span;
+               for (i = BF64_GET(*offset, 0, epbs);
                    i >= 0 && i < epb; i += inc) {
                        if (bp[i].blk_fill >= minfill &&
                            bp[i].blk_fill <= maxfill &&
                            (hole || bp[i].blk_birth > txg))
                                break;
-                       if (inc < 0 && *offset < (1ULL << span))
-                               *offset = 0;
-                       else
-                               *offset += (1ULL << span) * inc;
+                       if (inc > 0 || *offset > 0)
+                               *offset += inc;
+               }
+               *offset = *offset << span;
+               if (inc < 0) {
+                       /* traversing backwards; position offset at the end */
+                       ASSERT3U(*offset, <=, start);
+                       *offset = MIN(*offset + (1ULL << span) - 1, start);
+               } else if (*offset < start) {
+                       *offset = start;
                }
-               if (i < 0 || i == epb)
+               if (i < 0 || i >= epb)
                        error = ESRCH;
        }
 
index 0fe7eb5..edc36e7 100644 (file)
@@ -39,6 +39,7 @@
 #include <sys/spa.h>
 #include <sys/zfs_znode.h>
 #include <sys/sunddi.h>
+#include <sys/zvol.h>
 
 static char *dsl_reaper = "the grim reaper";
 
@@ -262,6 +263,7 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv)
        ASSERT(!list_link_active(&ds->ds_synced_link));
 
        mutex_destroy(&ds->ds_lock);
+       mutex_destroy(&ds->ds_recvlock);
        mutex_destroy(&ds->ds_opening_lock);
        mutex_destroy(&ds->ds_deadlist.bpl_lock);
        rw_destroy(&ds->ds_rwlock);
@@ -359,6 +361,7 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
                ds->ds_phys = dbuf->db_data;
 
                mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
+               mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
                mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
                mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT,
                    NULL);
@@ -377,6 +380,7 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
                         * just opened it.
                         */
                        mutex_destroy(&ds->ds_lock);
+                       mutex_destroy(&ds->ds_recvlock);
                        mutex_destroy(&ds->ds_opening_lock);
                        mutex_destroy(&ds->ds_deadlist.bpl_lock);
                        rw_destroy(&ds->ds_rwlock);
@@ -406,8 +410,15 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
                                        dsl_dataset_rele(origin, FTAG);
                                }
                        }
-               } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
-                       err = dsl_dataset_get_snapname(ds);
+               } else {
+                       if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
+                               err = dsl_dataset_get_snapname(ds);
+                       if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
+                               err = zap_count(
+                                   ds->ds_dir->dd_pool->dp_meta_objset,
+                                   ds->ds_phys->ds_userrefs_obj,
+                                   &ds->ds_userrefs);
+                       }
                }
 
                if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
@@ -448,6 +459,7 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
                                dsl_dataset_drop_ref(ds->ds_prev, ds);
                        dsl_dir_close(ds->ds_dir, ds);
                        mutex_destroy(&ds->ds_lock);
+                       mutex_destroy(&ds->ds_recvlock);
                        mutex_destroy(&ds->ds_opening_lock);
                        mutex_destroy(&ds->ds_deadlist.bpl_lock);
                        rw_destroy(&ds->ds_rwlock);
@@ -845,6 +857,7 @@ struct destroyarg {
        dsl_sync_task_group_t *dstg;
        char *snapname;
        char *failed;
+       boolean_t defer;
 };
 
 static int
@@ -852,23 +865,30 @@ dsl_snapshot_destroy_one(char *name, void *arg)
 {
        struct destroyarg *da = arg;
        dsl_dataset_t *ds;
-       char *cp;
        int err;
-
-       (void) strcat(name, "@");
-       (void) strcat(name, da->snapname);
-       err = dsl_dataset_own(name, DS_MODE_READONLY | DS_MODE_INCONSISTENT,
+       char *dsname;
+       size_t buflen;
+
+       /* alloc a buffer to hold name@snapname, plus the terminating NULL */
+       buflen = strlen(name) + strlen(da->snapname) + 2;
+       dsname = kmem_alloc(buflen, KM_SLEEP);
+       (void) snprintf(dsname, buflen, "%s@%s", name, da->snapname);
+       err = dsl_dataset_own(dsname, DS_MODE_READONLY | DS_MODE_INCONSISTENT,
            da->dstg, &ds);
-       cp = strchr(name, '@');
-       *cp = '\0';
+       kmem_free(dsname, buflen);
        if (err == 0) {
+               struct dsl_ds_destroyarg *dsda;
+
                dsl_dataset_make_exclusive(ds, da->dstg);
                if (ds->ds_user_ptr) {
                        ds->ds_user_evict_func(ds, ds->ds_user_ptr);
                        ds->ds_user_ptr = NULL;
                }
+               dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP);
+               dsda->ds = ds;
+               dsda->defer = da->defer;
                dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check,
-                   dsl_dataset_destroy_sync, ds, da->dstg, 0);
+                   dsl_dataset_destroy_sync, dsda, da->dstg, 0);
        } else if (err == ENOENT) {
                err = 0;
        } else {
@@ -882,7 +902,7 @@ dsl_snapshot_destroy_one(char *name, void *arg)
  */
 #pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy
 int
-dsl_snapshots_destroy(char *fsname, char *snapname)
+dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer)
 {
        int err;
        struct destroyarg da;
@@ -895,6 +915,7 @@ dsl_snapshots_destroy(char *fsname, char *snapname)
        da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
        da.snapname = snapname;
        da.failed = fsname;
+       da.defer = defer;
 
        err = dmu_objset_find(fsname,
            dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN);
@@ -904,7 +925,9 @@ dsl_snapshots_destroy(char *fsname, char *snapname)
 
        for (dst = list_head(&da.dstg->dstg_tasks); dst;
            dst = list_next(&da.dstg->dstg_tasks, dst)) {
-               dsl_dataset_t *ds = dst->dst_arg1;
+               struct dsl_ds_destroyarg *dsda = dst->dst_arg1;
+               dsl_dataset_t *ds = dsda->ds;
+
                /*
                 * Return the file system name that triggered the error
                 */
@@ -912,7 +935,9 @@ dsl_snapshots_destroy(char *fsname, char *snapname)
                        dsl_dataset_name(ds, fsname);
                        *strchr(fsname, '@') = '\0';
                }
+               ASSERT3P(dsda->rm_origin, ==, NULL);
                dsl_dataset_disown(ds, da.dstg);
+               kmem_free(dsda, sizeof (struct dsl_ds_destroyarg));
        }
 
        dsl_sync_task_group_destroy(da.dstg);
@@ -920,18 +945,100 @@ dsl_snapshots_destroy(char *fsname, char *snapname)
        return (err);
 }
 
+/*
+ * Report whether 'ds' currently meets every condition under which the
+ * destruction of a clone should also take its origin with it:
+ *     - ds_num_children is exactly 2,
+ *     - there are no user references (ds_userrefs == 0), and
+ *     - the dataset has been marked for deferred destroy.
+ *
+ * The fields are sampled under ds_lock, but the lock is dropped before
+ * returning, so the result describes only the moment of the call.
+ */
+static boolean_t
+dsl_dataset_might_destroy_origin(dsl_dataset_t *ds)
+{
+       boolean_t might_destroy = B_FALSE;
+
+       mutex_enter(&ds->ds_lock);
+       if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 &&
+           DS_IS_DEFER_DESTROY(ds))
+               might_destroy = B_TRUE;
+       mutex_exit(&ds->ds_lock);
+
+       return (might_destroy);
+}
+
+#ifdef _KERNEL
+/*
+ * Open the objset of 'ds' and, if it is of type DMU_OST_ZVOL, call
+ * zvol_remove_minor() on 'name'.  Objsets of any other type are simply
+ * closed again.
+ *
+ * Returns the error from dmu_objset_open_ds() if the objset cannot be
+ * opened, otherwise the result of zvol_remove_minor() for a zvol, or 0
+ * when the objset is not a zvol.
+ */
+static int
+dsl_dataset_zvol_cleanup(dsl_dataset_t *ds, const char *name)
+{
+       int error;
+       objset_t *os;
+
+       error = dmu_objset_open_ds(ds, DMU_OST_ANY, &os);
+       if (error)
+               return (error);
+
+       if (dmu_objset_type(os) == DMU_OST_ZVOL)
+               error = zvol_remove_minor(name);
+       dmu_objset_close(os);
+
+       return (error);
+}
+#endif
+
+/*
+ * If we're removing a clone, and these three conditions are true:
+ *     1) the clone's origin has no other children
+ *     2) the clone's origin has no user references
+ *     3) the clone's origin has been marked for deferred destruction
+ * Then, prepare to remove the origin as part of this sync task group.
+ *
+ * Preparation means: (kernel only) unmounting the origin snapshot and
+ * running the zvol cleanup on it, then taking ownership of the origin by
+ * name with 'tag' and making that ownership exclusive.  On success the
+ * owned origin is recorded in dsda->rm_origin.
+ *
+ * Returns 0 if the origin was prepared or if the conditions above do not
+ * hold (in which case dsda->rm_origin is left untouched); otherwise
+ * returns the error from the step that failed.
+ */
+static int
+dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
+{
+       dsl_dataset_t *ds = dsda->ds;
+       dsl_dataset_t *origin = ds->ds_prev;
+
+       if (dsl_dataset_might_destroy_origin(origin)) {
+               char *name;
+               int namelen;
+               int error;
+
+               /* room for the full dataset name plus its terminator */
+               namelen = dsl_dataset_namelen(origin) + 1;
+               name = kmem_alloc(namelen, KM_SLEEP);
+               dsl_dataset_name(origin, name);
+#ifdef _KERNEL
+               error = zfs_unmount_snap(name, NULL);
+               if (error) {
+                       kmem_free(name, namelen);
+                       return (error);
+               }
+               error = dsl_dataset_zvol_cleanup(origin, name);
+               if (error) {
+                       kmem_free(name, namelen);
+                       return (error);
+               }
+#endif
+               /*
+                * Re-open the origin by name as its owner; from here on
+                * 'origin' refers to the handle returned by
+                * dsl_dataset_own(), not to ds->ds_prev.
+                */
+               error = dsl_dataset_own(name,
+                   DS_MODE_READONLY | DS_MODE_INCONSISTENT,
+                   tag, &origin);
+               kmem_free(name, namelen);
+               if (error)
+                       return (error);
+               dsda->rm_origin = origin;
+               dsl_dataset_make_exclusive(origin, tag);
+       }
+
+       return (0);
+}
+
 /*
  * ds must be opened as OWNER.  On return (whether successful or not),
  * ds will be closed and caller can no longer dereference it.
  */
 int
-dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
+dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
 {
        int err;
        dsl_sync_task_group_t *dstg;
        objset_t *os;
        dsl_dir_t *dd;
        uint64_t obj;
+       struct dsl_ds_destroyarg dsda = {0};
+
+       dsda.ds = ds;
 
        if (dsl_dataset_is_snapshot(ds)) {
                /* Destroying a snapshot is simpler */
@@ -941,9 +1048,12 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
                        ds->ds_user_evict_func(ds, ds->ds_user_ptr);
                        ds->ds_user_ptr = NULL;
                }
+               /* NOTE: defer is always B_FALSE for non-snapshots */
+               dsda.defer = defer;
                err = dsl_sync_task_do(ds->ds_dir->dd_pool,
                    dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
-                   ds, tag, 0);
+                   &dsda, tag, 0);
+               ASSERT3P(dsda.rm_origin, ==, NULL);
                goto out;
        }
 
@@ -1024,13 +1134,45 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
                ds->ds_user_evict_func(ds, ds->ds_user_ptr);
                ds->ds_user_ptr = NULL;
        }
-       dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
-       dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
-           dsl_dataset_destroy_sync, ds, tag, 0);
-       dsl_sync_task_create(dstg, dsl_dir_destroy_check,
-           dsl_dir_destroy_sync, dd, FTAG, 0);
-       err = dsl_sync_task_group_wait(dstg);
-       dsl_sync_task_group_destroy(dstg);
+
+       /*
+        * If we're removing a clone, we might also need to remove its
+        * origin.
+        */
+       do {
+               dsda.need_prep = B_FALSE;
+               if (dsl_dir_is_clone(dd)) {
+                       err = dsl_dataset_origin_rm_prep(&dsda, tag);
+                       if (err) {
+                               dsl_dir_close(dd, FTAG);
+                               goto out;
+                       }
+               }
+
+               dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
+               dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
+                   dsl_dataset_destroy_sync, &dsda, tag, 0);
+               dsl_sync_task_create(dstg, dsl_dir_destroy_check,
+                   dsl_dir_destroy_sync, dd, FTAG, 0);
+               err = dsl_sync_task_group_wait(dstg);
+               dsl_sync_task_group_destroy(dstg);
+
+               /*
+                * We could be racing against 'zfs release' or 'zfs destroy -d'
+                * on the origin snap, in which case we can get EBUSY if we
+                * needed to destroy the origin snap but were not ready to
+                * do so.
+                */
+               if (dsda.need_prep) {
+                       ASSERT(err == EBUSY);
+                       ASSERT(dsl_dir_is_clone(dd));
+                       ASSERT(dsda.rm_origin == NULL);
+               }
+       } while (dsda.need_prep);
+
+       if (dsda.rm_origin != NULL)
+               dsl_dataset_disown(dsda.rm_origin, tag);
+
        /* if it is successful, dsl_dir_destroy_sync will close the dd */
        if (err)
                dsl_dir_close(dd, FTAG);
@@ -1211,7 +1353,8 @@ dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
        /*
         * We can only roll back to emptyness if it is a ZPL objset.
         */
-       if (*ost != DMU_OST_ZFS && ds->ds_phys->ds_prev_snap_txg == 0)
+       if (*ost != DMU_OST_ZFS &&
+           ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL)
                return (EINVAL);
 
        /*
@@ -1316,6 +1459,7 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
        } else {
                objset_impl_t *osi;
 
+               ASSERT(*ost != DMU_OST_ZVOL);
                ASSERT3U(ds->ds_phys->ds_used_bytes, ==, 0);
                ASSERT3U(ds->ds_phys->ds_compressed_bytes, ==, 0);
                ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, ==, 0);
@@ -1385,18 +1529,63 @@ dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
            cr, "dataset = %llu", ds->ds_object);
 }
 
+/*
+ * Origin-related part of the destroy check for a clone (dsda->ds).
+ *
+ * If the clone's origin (ds->ds_prev) currently qualifies for removal
+ * along with the clone:
+ *     - when no origin has been prepared (dsda->rm_origin == NULL), set
+ *       dsda->need_prep and fail with EBUSY;
+ *     - otherwise run dsl_dataset_destroy_check() on the origin with
+ *       is_origin_rm set and return its result.
+ *
+ * If the origin does not qualify, release any origin that was prepared
+ * earlier (disown it and clear dsda->rm_origin) and return 0.
+ */
+static int
+dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag,
+    dmu_tx_t *tx)
+{
+       dsl_dataset_t *ds = dsda->ds;
+       dsl_dataset_t *ds_prev = ds->ds_prev;
+
+       if (dsl_dataset_might_destroy_origin(ds_prev)) {
+               struct dsl_ds_destroyarg ndsda = {0};
+
+               /*
+                * If we're not prepared to remove the origin, don't remove
+                * the clone either.
+                */
+               if (dsda->rm_origin == NULL) {
+                       dsda->need_prep = B_TRUE;
+                       return (EBUSY);
+               }
+
+               /* check the origin itself, as a nested destroy request */
+               ndsda.ds = ds_prev;
+               ndsda.is_origin_rm = B_TRUE;
+               return (dsl_dataset_destroy_check(&ndsda, tag, tx));
+       }
+
+       /*
+        * If we're not going to remove the origin after all,
+        * undo the open context setup.
+        */
+       if (dsda->rm_origin != NULL) {
+               dsl_dataset_disown(dsda->rm_origin, tag);
+               dsda->rm_origin = NULL;
+       }
+
+       return (0);
+}
+
 /* ARGSUSED */
 int
 dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
-       dsl_dataset_t *ds = arg1;
+       struct dsl_ds_destroyarg *dsda = arg1;
+       dsl_dataset_t *ds = dsda->ds;
 
        /* we have an owner hold, so noone else can destroy us */
        ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
 
-       /* Can't delete a branch point. */
-       if (ds->ds_phys->ds_num_children > 1)
-               return (EEXIST);
+       /*
+        * Only allow deferred destroy on pools that support it.
+        * NOTE: deferred destroy is only supported on snapshots.
+        */
+       if (dsda->defer) {
+               if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
+                   SPA_VERSION_USERREFS)
+                       return (ENOTSUP);
+               ASSERT(dsl_dataset_is_snapshot(ds));
+               return (0);
+       }
 
        /*
         * Can't delete a head dataset if there are snapshots of it.
@@ -1414,6 +1603,31 @@ dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
        if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
                return (EAGAIN);
 
+       if (dsl_dataset_is_snapshot(ds)) {
+               /*
+                * If this snapshot has an elevated user reference count,
+                * we can't destroy it yet.
+                */
+               if (ds->ds_userrefs > 0 && !dsda->releasing)
+                       return (EBUSY);
+
+               mutex_enter(&ds->ds_lock);
+               /*
+                * Can't delete a branch point. However, if we're destroying
+                * a clone and removing its origin due to it having a user
+                * hold count of 0 and having been marked for deferred destroy,
+                * it's OK for the origin to have a single clone.
+                */
+               if (ds->ds_phys->ds_num_children >
+                   (dsda->is_origin_rm ? 2 : 1)) {
+                       mutex_exit(&ds->ds_lock);
+                       return (EEXIST);
+               }
+               mutex_exit(&ds->ds_lock);
+       } else if (dsl_dir_is_clone(ds->ds_dir)) {
+               return (dsl_dataset_origin_check(dsda, arg2, tx));
+       }
+
        /* XXX we should do some i/o error checking... */
        return (0);
 }
@@ -1461,7 +1675,8 @@ dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
 void
 dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
 {
-       dsl_dataset_t *ds = arg1;
+       struct dsl_ds_destroyarg *dsda = arg1;
+       dsl_dataset_t *ds = dsda->ds;
        zio_t *zio;
        int err;
        int after_branch_point = FALSE;
@@ -1471,11 +1686,20 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
        uint64_t obj;
 
        ASSERT(ds->ds_owner);
-       ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+       ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
        ASSERT(ds->ds_prev == NULL ||
            ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
        ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
 
+       if (dsda->defer) {
+               ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+               if (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1) {
+                       dmu_buf_will_dirty(ds->ds_dbuf, tx);
+                       ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
+                       return;
+               }
+       }
+
        /* signal any waiters that this dataset is going away */
        mutex_enter(&ds->ds_lock);
        ds->ds_owner = dsl_reaper;
@@ -1521,6 +1745,20 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
                        /* This clone is toast. */
                        ASSERT(ds_prev->ds_phys->ds_num_children > 1);
                        ds_prev->ds_phys->ds_num_children--;
+
+                       /*
+                        * If the clone's origin has no other clones, no
+                        * user holds, and has been marked for deferred
+                        * deletion, then we should have done the necessary
+                        * destroy setup for it.
+                        */
+                       if (ds_prev->ds_phys->ds_num_children == 1 &&
+                           ds_prev->ds_userrefs == 0 &&
+                           DS_IS_DEFER_DESTROY(ds_prev)) {
+                               ASSERT3P(dsda->rm_origin, !=, NULL);
+                       } else {
+                               ASSERT3P(dsda->rm_origin, ==, NULL);
+                       }
                } else if (!after_branch_point) {
                        ds_prev->ds_phys->ds_next_snap_obj =
                            ds->ds_phys->ds_next_snap_obj;
@@ -1733,10 +1971,32 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
        }
        if (ds->ds_phys->ds_props_obj != 0)
                VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
+       if (ds->ds_phys->ds_userrefs_obj != 0)
+               VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
        dsl_dir_close(ds->ds_dir, ds);
        ds->ds_dir = NULL;
        dsl_dataset_drain_refs(ds, tag);
        VERIFY(0 == dmu_object_free(mos, obj, tx));
+
+       if (dsda->rm_origin) {
+               /*
+                * Remove the origin of the clone we just destroyed.
+                */
+               dsl_dataset_t *origin = ds->ds_prev;
+               struct dsl_ds_destroyarg ndsda = {0};
+
+               ASSERT3P(origin, ==, dsda->rm_origin);
+               if (origin->ds_user_ptr) {
+                       origin->ds_user_evict_func(origin, origin->ds_user_ptr);
+                       origin->ds_user_ptr = NULL;
+               }
+
+               dsl_dataset_rele(origin, tag);
+               ds->ds_prev = NULL;
+
+               ndsda.ds = origin;
+               dsl_dataset_destroy_sync(&ndsda, tag, cr, tx);
+       }
 }
 
 static int
@@ -1951,6 +2211,9 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
            ds->ds_reserved);
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
            ds->ds_phys->ds_guid);
+       dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, ds->ds_userrefs);
+       dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
+           DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
 
        if (ds->ds_phys->ds_next_snap_obj) {
                /*
@@ -3019,7 +3282,7 @@ dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 
        ds->ds_quota = new_quota;
 
-       dsl_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx);
+       dsl_dir_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx);
 
        spa_history_internal_log(LOG_DS_REFQUOTA, ds->ds_dir->dd_pool->dp_spa,
            tx, cr, "%lld dataset = %llu ",
@@ -3114,7 +3377,7 @@ dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr,
 
        dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
        mutex_exit(&ds->ds_dir->dd_lock);
-       dsl_prop_set_uint64_sync(ds->ds_dir, "refreservation",
+       dsl_dir_prop_set_uint64_sync(ds->ds_dir, "refreservation",
            new_reservation, cr, tx);
 
        spa_history_internal_log(LOG_DS_REFRESERV,
@@ -3138,3 +3401,421 @@ dsl_dataset_set_reservation(const char *dsname, uint64_t reservation)
        dsl_dataset_rele(ds, FTAG);
        return (err);
 }
+
+/*
+ * Sync-task check function for placing a user hold on a snapshot.
+ *
+ * arg1 is the dsl_dataset_t being held and arg2 is the hold tag string.
+ * tx is part of the check-function signature and is not used here.
+ *
+ * Returns 0 if the hold may be taken, otherwise:
+ *     ENOTSUP         pool version is older than SPA_VERSION_USERREFS
+ *     EINVAL          the dataset is not a snapshot
+ *     ENAMETOOLONG    the tag is ZAP_MAXNAMELEN characters or longer
+ *     EEXIST          the snapshot already has a hold with this tag
+ * or any other error returned by zap_lookup().
+ */
+static int
+dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+       dsl_dataset_t *ds = arg1;
+       char *htag = arg2;
+       objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+       uint64_t tmp;
+       int error = 0;
+
+       if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
+               return (ENOTSUP);
+
+       if (!dsl_dataset_is_snapshot(ds))
+               return (EINVAL);
+
+       if (strlen(htag) >= ZAP_MAXNAMELEN)
+               return (ENAMETOOLONG);
+
+       /* tags must be unique */
+       mutex_enter(&ds->ds_lock);
+       if (ds->ds_phys->ds_userrefs_obj) {
+               /*
+                * zap_lookup() stores the 8-byte value it finds through
+                * its last argument, so give it a scratch integer rather
+                * than the transaction pointer.  Only the existence of
+                * the tag matters here, not the stored value.
+                */
+               error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag,
+                   8, 1, &tmp);
+               if (error == 0)
+                       error = EEXIST;
+               else if (error == ENOENT)
+                       error = 0;
+       }
+       mutex_exit(&ds->ds_lock);
+
+       return (error);
+}
+
+/*
+ * Sync-task function that records a user hold on a snapshot.
+ *
+ * arg1 is the dsl_dataset_t being held and arg2 is the hold tag string.
+ * Creates the userrefs ZAP object on the first hold, increments the
+ * in-core ds_userrefs count, stores the current time under the tag, and
+ * logs the hold to the pool history.
+ */
+static void
+dsl_dataset_user_hold_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+       dsl_dataset_t *ds = arg1;
+       char *htag = arg2;
+       objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+       /*
+        * The ZAP entry is written as a single 8-byte integer, so the
+        * timestamp must live in a 64-bit variable; a time_t may be
+        * narrower than that.
+        */
+       uint64_t now = gethrestime_sec();
+       uint64_t zapobj;
+
+       mutex_enter(&ds->ds_lock);
+       if (ds->ds_phys->ds_userrefs_obj == 0) {
+               /*
+                * This is the first user hold for this dataset.  Create
+                * the userrefs zap object.
+                */
+               dmu_buf_will_dirty(ds->ds_dbuf, tx);
+               zapobj = ds->ds_phys->ds_userrefs_obj =
+                   zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
+       } else {
+               zapobj = ds->ds_phys->ds_userrefs_obj;
+       }
+       ds->ds_userrefs++;
+       mutex_exit(&ds->ds_lock);
+
+       /* The check function already verified that the tag is unique. */
+       VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx));
+
+       spa_history_internal_log(LOG_DS_USER_HOLD,
+           ds->ds_dir->dd_pool->dp_spa, tx, cr, "<%s> dataset = %llu",
+           htag, ds->ds_object);
+}
+
+struct dsl_ds_holdarg {        /* state shared by the hold and release paths */
+       dsl_sync_task_group_t *dstg;    /* task group; also the hold tag */
+       char *htag;                     /* user hold tag (ZAP entry name) */
+       char *snapname;                 /* snapshot name after the @ */
+       boolean_t recursive;            /* visit child datasets as well? */
+       char failed[MAXPATHLEN];        /* name copied back on error */
+};
+
+/*
+ * Per-dataset worker for dsl_dataset_user_hold(); used directly and as
+ * the dmu_objset_find() callback.  Holds dsname@snapname via
+ * dsl_dataset_hold(), tagged with the sync task group, and queues the
+ * hold check/sync pair for it.
+ *
+ * A missing snapshot is tolerated when recursing.  Any other failure
+ * saves dsname in the failed buffer and returns the error.
+ */
+static int
+dsl_dataset_user_hold_one(char *dsname, void *arg)
+{
+       struct dsl_ds_holdarg *harg = arg;
+       dsl_dataset_t *snap;
+       /* room for dsname@snapname plus the terminating NUL */
+       size_t namelen = strlen(dsname) + strlen(harg->snapname) + 2;
+       char *fullname = kmem_alloc(namelen, KM_SLEEP);
+       int err;
+
+       (void) snprintf(fullname, namelen, "%s@%s", dsname, harg->snapname);
+       err = dsl_dataset_hold(fullname, harg->dstg, &snap);
+       kmem_free(fullname, namelen);
+
+       if (err == 0) {
+               dsl_sync_task_create(harg->dstg, dsl_dataset_user_hold_check,
+                   dsl_dataset_user_hold_sync, snap, harg->htag, 0);
+               return (0);
+       }
+
+       if (err == ENOENT && harg->recursive)
+               return (0);
+
+       (void) strcpy(harg->failed, dsname);
+       return (err);
+}
+/*
+ * Place a user hold tagged htag on snapshot dsname@snapname and, when
+ * recursive is set, on the same-named snapshot of each dataset visited by
+ * dmu_objset_find() with DS_FIND_CHILDREN.  Returns 0 or an errno.  On
+ * error, dsname is overwritten with the contents of the failed buffer.
+ */
+int
+dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
+    boolean_t recursive)
+{
+       struct dsl_ds_holdarg *ha;
+       dsl_sync_task_t *dst;
+       spa_t *spa;
+       int error;
+
+       ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
+       /* Until something more specific fails, report the requested name. */
+       (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
+
+       error = spa_open(dsname, &spa, FTAG);
+       if (error) {
+               kmem_free(ha, sizeof (struct dsl_ds_holdarg));
+               return (error);
+       }
+
+       ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
+       ha->htag = htag;
+       ha->snapname = snapname;
+       ha->recursive = recursive;
+       if (recursive) {
+               error = dmu_objset_find(dsname, dsl_dataset_user_hold_one,
+                   ha, DS_FIND_CHILDREN);
+       } else {
+               error = dsl_dataset_user_hold_one(dsname, ha);
+       }
+       if (error == 0)
+               error = dsl_sync_task_group_wait(ha->dstg);
+       /*
+        * Each queued task still has its snapshot held with the group as
+        * the tag.  Drop those holds whether or not the group ran, and for
+        * a failed task record the snapshot name cut off at the @.
+        */
+       for (dst = list_head(&ha->dstg->dstg_tasks); dst;
+           dst = list_next(&ha->dstg->dstg_tasks, dst)) {
+               dsl_dataset_t *ds = dst->dst_arg1;
+
+               if (dst->dst_err) {
+                       dsl_dataset_name(ds, ha->failed);
+                       *strchr(ha->failed, '@') = '\0';
+               }
+               dsl_dataset_rele(ds, ha->dstg);
+       }
+       /* Report the failing name through the dsname buffer. */
+       if (error)
+               (void) strcpy(dsname, ha->failed);
+
+       dsl_sync_task_group_destroy(ha->dstg);
+       kmem_free(ha, sizeof (struct dsl_ds_holdarg));
+       spa_close(spa, FTAG);
+       return (error);
+}
+
+struct dsl_ds_releasearg {     /* arg1 of the user release check/sync tasks */
+       dsl_dataset_t *ds;      /* snapshot the hold is removed from */
+       const char *htag;       /* tag of the hold to remove */
+       boolean_t own;          /* do we own or just hold ds? */
+};
+
+/*
+ * Check that hold htag exists on ds and report through might_destroy
+ * whether removing it would leave a deferred-destroy snapshot ready to be
+ * destroyed (exactly one user reference, ds_num_children of one, and the
+ * defer-destroy flag set).
+ *
+ * Returns 0 if the tag exists, ESRCH if it does not, or another error
+ * from zap_lookup().  might_destroy is B_FALSE whenever an error is
+ * returned.
+ */
+static int
+dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag,
+    boolean_t *might_destroy)
+{
+       objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+       uint64_t holds_obj;
+       uint64_t unused;
+       int err;
+
+       *might_destroy = B_FALSE;
+
+       mutex_enter(&ds->ds_lock);
+       holds_obj = ds->ds_phys->ds_userrefs_obj;
+       if (holds_obj == 0) {
+               /* No userrefs object, so the tag can't possibly exist */
+               err = ESRCH;
+       } else {
+               /* Make sure the tag exists */
+               err = zap_lookup(mos, holds_obj, htag, 8, 1, &unused);
+               if (err == ENOENT)
+                       err = ESRCH;
+       }
+
+       if (err == 0) {
+               if (ds->ds_userrefs == 1 &&
+                   ds->ds_phys->ds_num_children == 1 &&
+                   DS_IS_DEFER_DESTROY(ds))
+                       *might_destroy = B_TRUE;
+       }
+       mutex_exit(&ds->ds_lock);
+
+       return (err);
+}
+/*
+ * Sync-task check function for releasing a user hold.  arg1 is a
+ * struct dsl_ds_releasearg; tag is passed on to
+ * dsl_dataset_destroy_check().
+ *
+ * Returns ENOTSUP on pools older than SPA_VERSION_USERREFS, the error
+ * from dsl_dataset_release_might_destroy() if the tag is not found, and
+ * EBUSY in syncing context when the release would destroy a snapshot that
+ * is only held, not owned.  Otherwise returns 0, or the result of the
+ * destroy check when the release might destroy the snapshot.
+ */
+static int
+dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
+{
+       struct dsl_ds_releasearg *ra = arg1;
+       dsl_dataset_t *ds = ra->ds;
+       boolean_t might_destroy;
+       int error;
+
+       if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
+               return (ENOTSUP);
+
+       error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy);
+       if (error)
+               return (error);
+
+       if (might_destroy) {
+               struct dsl_ds_destroyarg dsda = {0};
+
+               if (dmu_tx_is_syncing(tx)) {
+                       /*
+                        * If we're not prepared to remove the snapshot,
+                        * we can't allow the release to happen right now.
+                        */
+                       if (!ra->own)
+                               return (EBUSY);
+                       if (ds->ds_user_ptr) {  /* evict the user data first */
+                               ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+                               ds->ds_user_ptr = NULL;
+                       }
+               }
+               dsda.ds = ds;
+               dsda.releasing = B_TRUE;
+               return (dsl_dataset_destroy_check(&dsda, tag, tx));
+       }
+
+       return (0);
+}
+/*
+ * Sync-task function that removes hold ra->htag from ra->ds.  Decrements
+ * ds_userrefs and removes the tag from the userrefs ZAP object.  If no
+ * user references remain, ds_num_children is one and the snapshot is
+ * flagged for deferred destroy, the snapshot is destroyed here as well.
+ * The release is then logged to the pool history.
+ */
+static void
+dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
+{
+       struct dsl_ds_releasearg *ra = arg1;
+       dsl_dataset_t *ds = ra->ds;
+       spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+       objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+       uint64_t zapobj;
+       uint64_t dsobj = ds->ds_object; /* read up front; logged at the end */
+       uint64_t refs;
+
+       mutex_enter(&ds->ds_lock);
+       ds->ds_userrefs--;
+       refs = ds->ds_userrefs;         /* remaining count, for the log entry */
+       mutex_exit(&ds->ds_lock);
+       zapobj = ds->ds_phys->ds_userrefs_obj;
+       VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx));
+       if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 &&
+           DS_IS_DEFER_DESTROY(ds)) {
+               struct dsl_ds_destroyarg dsda = {0};
+
+               ASSERT(ra->own);
+               dsda.ds = ds;
+               dsda.releasing = B_TRUE;
+               /* We already did the destroy_check */
+               dsl_dataset_destroy_sync(&dsda, tag, cr, tx);
+       }
+
+       spa_history_internal_log(LOG_DS_USER_RELEASE,
+           spa, tx, cr, "<%s> %lld dataset = %llu",
+           ra->htag, (longlong_t)refs, dsobj);
+}
+
+/*
+ * Per-dataset worker for dsl_dataset_user_release(); used directly and as
+ * the dmu_objset_find() callback.  Holds dsname@snapname, verifies that
+ * hold ha->htag exists on it, and queues the release check/sync pair.
+ *
+ * If the release might destroy the snapshot (see
+ * dsl_dataset_release_might_destroy()), the snapshot is first unmounted
+ * and its zvol state cleaned up in the kernel, and the hold is upgraded
+ * to exclusive ownership so the sync task is allowed to destroy it.
+ *
+ * Returns 0 on success, and also when the snapshot does not exist during
+ * a recursive release.  Otherwise returns an errno with no hold left on
+ * the snapshot: ENAMETOOLONG for an oversized tag, EBUSY if ownership
+ * cannot be obtained, or the error from the failing call.
+ */
+static int
+dsl_dataset_user_release_one(char *dsname, void *arg)
+{
+       struct dsl_ds_holdarg *ha = arg;
+       struct dsl_ds_releasearg *ra;
+       dsl_dataset_t *ds;
+       int error;
+       void *dtag = ha->dstg;
+       char *name;
+       size_t buflen;
+       boolean_t own = B_FALSE;
+       boolean_t might_destroy;
+
+       if (strlen(ha->htag) >= ZAP_MAXNAMELEN)
+               return (ENAMETOOLONG);
+
+       /* alloc a buffer to hold dsname@snapname, plus the terminating NUL */
+       buflen = strlen(dsname) + strlen(ha->snapname) + 2;
+       name = kmem_alloc(buflen, KM_SLEEP);
+       (void) snprintf(name, buflen, "%s@%s", dsname, ha->snapname);
+       error = dsl_dataset_hold(name, dtag, &ds);
+       if (error == ENOENT && ha->recursive) {
+               error = 0;
+               goto out;
+       }
+       (void) strcpy(ha->failed, dsname);
+       if (error)
+               goto out;
+
+       ASSERT(dsl_dataset_is_snapshot(ds));
+
+       error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy);
+       if (error)
+               goto out_rele;
+
+       if (might_destroy) {
+#ifdef _KERNEL
+               /*
+                * Both calls below need the full snapshot name, so the
+                * buffer must stay allocated until the exit path.
+                */
+               error = zfs_unmount_snap(name, NULL);
+               if (error)
+                       goto out_rele;
+               error = dsl_dataset_zvol_cleanup(ds, name);
+               if (error)
+                       goto out_rele;
+#endif
+               if (!dsl_dataset_tryown(ds,
+                   DS_MODE_READONLY | DS_MODE_INCONSISTENT, dtag)) {
+                       error = EBUSY;
+                       goto out_rele;
+               }
+               own = B_TRUE;
+               dsl_dataset_make_exclusive(ds, dtag);
+       }
+
+       /* Freed by dsl_dataset_user_release() once the task group is done. */
+       ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP);
+       ra->ds = ds;
+       ra->htag = ha->htag;
+       ra->own = own;
+       dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check,
+           dsl_dataset_user_release_sync, ra, dtag, 0);
+
+       /* Success: the hold (or ownership) stays with the queued task. */
+       kmem_free(name, buflen);
+       return (0);
+
+out_rele:
+       dsl_dataset_rele(ds, dtag);
+out:
+       kmem_free(name, buflen);
+       return (error);
+}
+/*
+ * Release the user hold tagged htag from snapshot dsname@snapname and,
+ * when recursive is set, from the same-named snapshot of each dataset
+ * visited by dmu_objset_find() with DS_FIND_CHILDREN.  Returns 0 or an
+ * errno.  On error, dsname is overwritten with the contents of the failed
+ * buffer.
+ */
+int
+dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
+    boolean_t recursive)
+{
+       struct dsl_ds_holdarg *ha;
+       dsl_sync_task_t *dst;
+       spa_t *spa;
+       int error;
+
+       ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
+       /* Until something more specific fails, report the requested name. */
+       (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
+
+       error = spa_open(dsname, &spa, FTAG);
+       if (error) {
+               kmem_free(ha, sizeof (struct dsl_ds_holdarg));
+               return (error);
+       }
+
+       ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
+       ha->htag = htag;
+       ha->snapname = snapname;
+       ha->recursive = recursive;
+       if (recursive) {
+               error = dmu_objset_find(dsname, dsl_dataset_user_release_one,
+                   ha, DS_FIND_CHILDREN);
+       } else {
+               error = dsl_dataset_user_release_one(dsname, ha);
+       }
+       if (error == 0)
+               error = dsl_sync_task_group_wait(ha->dstg);
+       /*
+        * Each queued task carries a releasearg and a snapshot that is
+        * either owned or held with the group as the tag.  Give both back
+        * whether or not the group ran, and record the snapshot name of a
+        * failed task.
+        */
+       for (dst = list_head(&ha->dstg->dstg_tasks); dst;
+           dst = list_next(&ha->dstg->dstg_tasks, dst)) {
+               struct dsl_ds_releasearg *ra = dst->dst_arg1;
+               dsl_dataset_t *ds = ra->ds;
+
+               if (dst->dst_err)
+                       dsl_dataset_name(ds, ha->failed);
+
+               if (ra->own)
+                       dsl_dataset_disown(ds, ha->dstg);
+               else
+                       dsl_dataset_rele(ds, ha->dstg);
+
+               kmem_free(ra, sizeof (struct dsl_ds_releasearg));
+       }
+       /* Report the failing name through the dsname buffer. */
+       if (error)
+               (void) strcpy(dsname, ha->failed);
+
+       dsl_sync_task_group_destroy(ha->dstg);
+       kmem_free(ha, sizeof (struct dsl_ds_holdarg));
+       spa_close(spa, FTAG);
+       return (error);
+}
+
+/*
+ * Build a freshly allocated nvlist in *nvp with one uint64 entry per user
+ * hold on dataset dsname: the entry name is the hold tag and the value is
+ * the integer stored under that tag in the userrefs ZAP object.  The list
+ * is empty when the dataset has no userrefs object.
+ *
+ * Returns 0, or the error from dsl_dataset_hold(), in which case *nvp is
+ * not touched.
+ */
+int
+dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp)
+{
+       dsl_dataset_t *snap;
+       int error;
+
+       error = dsl_dataset_hold(dsname, FTAG, &snap);
+       if (error)
+               return (error);
+
+       VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP));
+       if (snap->ds_phys->ds_userrefs_obj != 0) {
+               zap_cursor_t cursor;
+               zap_attribute_t *attr;
+
+               attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+               zap_cursor_init(&cursor,
+                   snap->ds_dir->dd_pool->dp_meta_objset,
+                   snap->ds_phys->ds_userrefs_obj);
+               /* Walk every tag until the cursor stops returning entries. */
+               while (zap_cursor_retrieve(&cursor, attr) == 0) {
+                       VERIFY(0 == nvlist_add_uint64(*nvp, attr->za_name,
+                           attr->za_first_integer));
+                       zap_cursor_advance(&cursor);
+               }
+               zap_cursor_fini(&cursor);
+               kmem_free(attr, sizeof (zap_attribute_t));
+       }
+       dsl_dataset_rele(snap, FTAG);
+       return (0);
+}
index da5d157..5d76ff5 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -66,8 +66,6 @@
  * The ZAP OBJ is referred to as the jump object.
  */
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_tx.h>
@@ -540,7 +538,7 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
        dsl_pool_t *dp;
        void *cookie;
        int     error;
-       char    checkflag = ZFS_DELEG_LOCAL;
+       char    checkflag;
        objset_t *mos;
        avl_tree_t permsets;
        perm_set_t *setnode;
@@ -563,6 +561,16 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
                return (EPERM);
        }
 
+       if (dsl_dataset_is_snapshot(ds)) {
+               /*
+                * Snapshots are treated as descendents only,
+                * local permissions do not apply.
+                */
+               checkflag = ZFS_DELEG_DESCENDENT;
+       } else {
+               checkflag = ZFS_DELEG_LOCAL;
+       }
+
        avl_create(&permsets, perm_set_compare, sizeof (perm_set_t),
            offsetof(perm_set_t, p_node));
 
index f19653d..2f312ae 100644 (file)
@@ -96,7 +96,6 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
 #endif
        if (dd == NULL) {
                dsl_dir_t *winner;
-               int err;
 
                dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
                dd->dd_object = ddobj;
index 664ccff..bfc0fa8 100644 (file)
@@ -442,7 +442,7 @@ dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 }
 
 void
-dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
+dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
     cred_t *cr, dmu_tx_t *tx)
 {
        objset_t *mos = dd->dd_pool->dp_meta_objset;
index 8a802b5..03ebb90 100644 (file)
@@ -1024,6 +1024,8 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
 int
 dsl_pool_scrub_clean(dsl_pool_t *dp)
 {
+       spa_t *spa = dp->dp_spa;
+
        /*
         * Purge all vdev caches.  We do this here rather than in sync
         * context because this requires a writer lock on the spa_config
@@ -1031,11 +1033,11 @@ dsl_pool_scrub_clean(dsl_pool_t *dp)
         * spa_scrub_reopen flag indicates that vdev_open() should not
         * attempt to start another scrub.
         */
-       spa_config_enter(dp->dp_spa, SCL_ALL, FTAG, RW_WRITER);
-       dp->dp_spa->spa_scrub_reopen = B_TRUE;
-       vdev_reopen(dp->dp_spa->spa_root_vdev);
-       dp->dp_spa->spa_scrub_reopen = B_FALSE;
-       spa_config_exit(dp->dp_spa, SCL_ALL, FTAG);
+       spa_vdev_state_enter(spa);
+       spa->spa_scrub_reopen = B_TRUE;
+       vdev_reopen(spa->spa_root_vdev);
+       spa->spa_scrub_reopen = B_FALSE;
+       (void) spa_vdev_state_exit(spa, NULL, 0);
 
        return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN));
 }
index a363d70..3ff71b3 100644 (file)
@@ -117,6 +117,7 @@ typedef enum dmu_object_type {
        DMU_OT_SCRUB_QUEUE,             /* ZAP */
        DMU_OT_USERGROUP_USED,          /* ZAP */
        DMU_OT_USERGROUP_QUOTA,         /* ZAP */
+       DMU_OT_USERREFS,                /* ZAP */
        DMU_OT_NUMTYPES
 } dmu_object_type_t;
 
@@ -174,8 +175,8 @@ int dmu_objset_evict_dbufs(objset_t *os);
 int dmu_objset_create(const char *name, dmu_objset_type_t type,
     objset_t *clone_parent, uint64_t flags,
     void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
-int dmu_objset_destroy(const char *name);
-int dmu_snapshots_destroy(char *fsname, char *snapname);
+int dmu_objset_destroy(const char *name, boolean_t defer);
+int dmu_snapshots_destroy(char *fsname, char *snapname, boolean_t defer);
 int dmu_objset_rollback(objset_t *os);
 int dmu_objset_snapshot(char *fsname, char *snapname, struct nvlist *props,
     boolean_t recursive);
@@ -646,10 +647,9 @@ typedef struct dmu_recv_cookie {
 } dmu_recv_cookie_t;
 
 int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *,
-    boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *);
+    boolean_t force, objset_t *origin, dmu_recv_cookie_t *);
 int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp);
 int dmu_recv_end(dmu_recv_cookie_t *drc);
-void dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc);
 
 /* CRC64 table */
 #define        ZFS_CRC64_POLY  0xC96C5795D7870F42ULL   /* ECMA-182, reflected form */
index 96ce688..3868a58 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -211,10 +211,11 @@ extern "C" {
  * ds_lock
  *    protects:
  *     ds_user_ptr
- *     ds_user_evice_func
+ *     ds_user_evict_func
  *     ds_open_refcount
  *     ds_snapname
  *     ds_phys accounting
+ *     ds_phys userrefs zapobj
  *     ds_reserved
  *    held from:
  *     dsl_dataset_*
index 82cb6ad..052cb8d 100644 (file)
@@ -117,7 +117,7 @@ void dmu_objset_close(objset_t *os);
 int dmu_objset_create(const char *name, dmu_objset_type_t type,
     objset_t *clone_parent, uint64_t flags,
     void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
-int dmu_objset_destroy(const char *name);
+int dmu_objset_destroy(const char *name, boolean_t defer);
 int dmu_objset_rollback(objset_t *os);
 int dmu_objset_snapshot(char *fsname, char *snapname, nvlist_t *props,
     boolean_t recursive);
index a1c2896..b51036d 100644 (file)
@@ -63,6 +63,14 @@ typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *);
 #define        DS_FLAG_UNIQUE_ACCURATE (1ULL<<2)
 
 /*
+ * DS_FLAG_DEFER_DESTROY is set after 'zfs destroy -d' has been called
+ * on a dataset. This allows the dataset to be destroyed using 'zfs release'.
+ */
+#define        DS_FLAG_DEFER_DESTROY   (1ULL<<3)
+#define        DS_IS_DEFER_DESTROY(ds) \
+       ((ds)->ds_phys->ds_flags & DS_FLAG_DEFER_DESTROY)
+
+/*
  * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
  * name lookups should be performed case-insensitively.
  */
@@ -93,7 +101,8 @@ typedef struct dsl_dataset_phys {
        blkptr_t ds_bp;
        uint64_t ds_next_clones_obj;    /* DMU_OT_DSL_CLONES */
        uint64_t ds_props_obj;          /* DMU_OT_DSL_PROPS for snaps */
-       uint64_t ds_pad[6]; /* pad out to 320 bytes for good measure */
+       uint64_t ds_userrefs_obj;       /* DMU_OT_USERREFS */
+       uint64_t ds_pad[5]; /* pad out to 320 bytes for good measure */
 } dsl_dataset_phys_t;
 
 typedef struct dsl_dataset {
@@ -111,6 +120,9 @@ typedef struct dsl_dataset {
        /* has internal locking: */
        bplist_t ds_deadlist;
 
+       /* to protect against multiple concurrent incremental recv */
+       kmutex_t ds_recvlock;
+
        /* protected by lock on pool's dp_dirty_datasets list */
        txg_node_t ds_dirty_link;
        list_node_t ds_synced_link;
@@ -122,6 +134,7 @@ typedef struct dsl_dataset {
        kmutex_t ds_lock;
        void *ds_user_ptr;
        dsl_dataset_evict_func_t *ds_user_evict_func;
+       uint64_t ds_userrefs;
 
        /*
         * ds_owner is protected by the ds_rwlock and the ds_lock
@@ -143,6 +156,15 @@ typedef struct dsl_dataset {
        char ds_snapname[MAXNAMELEN];
 } dsl_dataset_t;
 
+struct dsl_ds_destroyarg {     /* arg1 of dsl_dataset_destroy_check/_sync */
+       dsl_dataset_t *ds;              /* ds to destroy */
+       dsl_dataset_t *rm_origin;       /* also remove our origin? */
+       boolean_t is_origin_rm;         /* set if removing origin snap */
+       boolean_t defer;                /* destroy -d requested? */
+       boolean_t releasing;            /* destroying due to release? */
+       boolean_t need_prep;            /* do we need to retry due to EBUSY? */
+};
+
 #define        dsl_dataset_is_snapshot(ds)     \
        ((ds)->ds_phys->ds_num_children != 0)
 
@@ -167,8 +189,8 @@ uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname,
     dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *);
 uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
     uint64_t flags, dmu_tx_t *tx);
-int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag);
-int dsl_snapshots_destroy(char *fsname, char *snapname);
+int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer);
+int dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer);
 dsl_checkfunc_t dsl_dataset_destroy_check;
 dsl_syncfunc_t dsl_dataset_destroy_sync;
 dsl_checkfunc_t dsl_dataset_snapshot_check;
@@ -178,6 +200,11 @@ int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive);
 int dsl_dataset_promote(const char *name);
 int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
     boolean_t force);
+int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
+    boolean_t recursive);
+int dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
+    boolean_t recursive);
+int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp);
 
 void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
     void *p, dsl_dataset_evict_func_t func);
index b064c92..a26a3f7 100644 (file)
@@ -53,6 +53,8 @@ extern "C" {
 #define        ZFS_DELEG_PERM_GROUPQUOTA       "groupquota"
 #define        ZFS_DELEG_PERM_USERUSED         "userused"
 #define        ZFS_DELEG_PERM_GROUPUSED        "groupused"
+#define        ZFS_DELEG_PERM_HOLD             "hold"
+#define        ZFS_DELEG_PERM_RELEASE          "release"
 
 /*
  * Note: the names of properties that are marked delegatable are also
index 26018a4..5afaa1f 100644 (file)
@@ -69,7 +69,7 @@ dsl_syncfunc_t dsl_props_set_sync;
 int dsl_prop_set(const char *ddname, const char *propname,
     int intsz, int numints, const void *buf);
 int dsl_props_set(const char *dsname, nvlist_t *nvl);
-void dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
+void dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
     cred_t *cr, dmu_tx_t *tx);
 
 void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value);
index c7ae402..0a4d550 100644 (file)
@@ -500,8 +500,9 @@ extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read,
     char *his_buf);
 extern int spa_history_log(spa_t *spa, const char *his_buf,
     history_log_type_t what);
-void spa_history_internal_log(history_internal_events_t event, spa_t *spa,
-    dmu_tx_t *tx, cred_t *cr, const char *fmt, ...);
+extern void spa_history_internal_log(history_internal_events_t event,
+    spa_t *spa, dmu_tx_t *tx, cred_t *cr, const char *fmt, ...);
+extern void spa_history_log_version(spa_t *spa, history_internal_events_t evt);
 
 /* error handling */
 struct zbookmark;
index 12999ee..84da684 100644 (file)
@@ -105,6 +105,7 @@ struct spa {
        int             spa_inject_ref;         /* injection references */
        uint8_t         spa_sync_on;            /* sync threads are running */
        spa_load_state_t spa_load_state;        /* current load operation */
+       boolean_t       spa_load_verbatim;      /* load the given config? */
        taskq_t         *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
        dsl_pool_t      *spa_dsl_pool;
        metaslab_class_t *spa_normal_class;     /* normal data class */
index 71b9b12..7e53f62 100644 (file)
@@ -47,6 +47,7 @@ typedef enum vdev_dtl_type {
 extern boolean_t zfs_nocacheflush;
 
 extern int vdev_open(vdev_t *);
+extern void vdev_open_children(vdev_t *vd);
 extern int vdev_validate(vdev_t *);
 extern void vdev_close(vdev_t *);
 extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
index 8240b66..2378043 100644 (file)
@@ -127,6 +127,8 @@ struct vdev {
        space_map_t     vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */
        vdev_stat_t     vdev_stat;      /* virtual device statistics    */
        boolean_t       vdev_expanding; /* expand the vdev?             */
+       int             vdev_open_error; /* error on last open          */
+       kthread_t       *vdev_open_thread; /* thread opening children   */
 
        /*
         * Top-level vdev state.
index de20538..967174b 100644 (file)
@@ -182,8 +182,7 @@ int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name,
     boolean_t *normalization_conflictp);
 
 int zap_count_write(objset_t *os, uint64_t zapobj, const char *name,
-    int add, uint64_t *towrite, uint64_t *tooverwrite,
-    uint64_t dn_datablkshift);
+    int add, uint64_t *towrite, uint64_t *tooverwrite);
 
 /*
  * Create an attribute with the given name and value.
index f5e5aa7..3488962 100644 (file)
@@ -203,6 +203,7 @@ void zfs_oldace_byteswap(ace_t *, int);
 void zfs_ace_byteswap(void *, size_t, boolean_t);
 extern boolean_t zfs_has_access(struct znode *zp, cred_t *cr);
 extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *);
+int zfs_fastaccesschk_execute(struct znode *, cred_t *);
 extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *);
 extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *);
 extern int zfs_acl_access(struct znode *, int, cred_t *);
index 1e9f351..3a3e6e7 100644 (file)
@@ -165,6 +165,7 @@ typedef struct zfs_cmd {
        dmu_objset_stats_t zc_objset_stats;
        struct drr_begin zc_begin_record;
        zinject_record_t zc_inject_record;
+       boolean_t       zc_defer_destroy;
 } zfs_cmd_t;
 
 typedef struct zfs_useracct {
index b8ed7b2..2855523 100644 (file)
@@ -73,7 +73,6 @@ struct zfsvfs {
        boolean_t       z_vscan;        /* virus scan on/off */
        boolean_t       z_use_fuids;    /* version allows fuids */
        boolean_t       z_replay;       /* set during ZIL replay */
-       kmutex_t        z_online_recv_lock; /* held while recv in progress */
        uint64_t        z_version;      /* ZPL version */
        uint64_t        z_shares_dir;   /* hidden shares dir */
        kmutex_t        z_lock;
index 69f4b50..5db5b8d 100644 (file)
@@ -77,6 +77,7 @@ extern "C" {
 #define        ZFS_ACL_DEFAULTED       0x20            /* ACL should be defaulted */
 #define        ZFS_ACL_AUTO_INHERIT    0x40            /* ACL should be inherited */
 #define        ZFS_BONUS_SCANSTAMP     0x80            /* Scanstamp in bonus area */
+#define        ZFS_NO_EXECS_DENIED     0x100           /* exec was given to everyone */
 
 /*
  * Is ID ephemeral?
@@ -200,6 +201,7 @@ typedef struct znode {
        uint64_t        z_gen;          /* generation (same as zp_gen) */
        uint32_t        z_sync_cnt;     /* synchronous open count */
        kmutex_t        z_acl_lock;     /* acl data lock */
+       zfs_acl_t       *z_acl_cached;  /* cached acl */
        list_node_t     z_link_node;    /* all znodes in fs link */
        /*
         * These are dmu managed fields.
index 5c51717..e47d8f4 100644 (file)
@@ -143,6 +143,8 @@ enum zio_compress {
 #define        ZIO_FLAG_GODFATHER              0x080000
 
 #define        ZIO_FLAG_TRYHARD                0x100000
+#define        ZIO_FLAG_NODATA                 0x200000
+#define        ZIO_FLAG_OPTIONAL               0x400000
 
 #define        ZIO_FLAG_GANG_INHERIT           \
        (ZIO_FLAG_CANFAIL |             \
@@ -161,7 +163,9 @@ enum zio_compress {
        ZIO_FLAG_IO_REPAIR |            \
        ZIO_FLAG_IO_RETRY |             \
        ZIO_FLAG_PROBE |                \
-       ZIO_FLAG_TRYHARD)
+       ZIO_FLAG_TRYHARD |              \
+       ZIO_FLAG_NODATA |               \
+       ZIO_FLAG_OPTIONAL)
 
 #define        ZIO_FLAG_AGG_INHERIT            \
        (ZIO_FLAG_DONT_AGGREGATE |      \
index 710685d..4cef53f 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/refcount.h>
 #include <sys/rrwlock.h>
 
@@ -118,7 +116,7 @@ rrn_find_and_remove(rrwlock_t *rrl)
        rrw_node_t *prev = NULL;
 
        if (refcount_count(&rrl->rr_linked_rcount) == 0)
-               return (NULL);
+               return (B_FALSE);
 
        for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
                if (rn->rn_rrl == rrl) {
@@ -159,6 +157,14 @@ static void
 rrw_enter_read(rrwlock_t *rrl, void *tag)
 {
        mutex_enter(&rrl->rr_lock);
+#if !defined(DEBUG) && defined(_KERNEL)
+       if (!rrl->rr_writer && !rrl->rr_writer_wanted) {
+               rrl->rr_anon_rcount.rc_count++;
+               mutex_exit(&rrl->rr_lock);
+               return;
+       }
+       DTRACE_PROBE(zfs__rrwfastpath__rdmiss);
+#endif
        ASSERT(rrl->rr_writer != curthread);
        ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);
 
@@ -208,19 +214,28 @@ void
 rrw_exit(rrwlock_t *rrl, void *tag)
 {
        mutex_enter(&rrl->rr_lock);
+#if !defined(DEBUG) && defined(_KERNEL)
+       if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) {
+               rrl->rr_anon_rcount.rc_count--;
+               if (rrl->rr_anon_rcount.rc_count == 0)
+                       cv_broadcast(&rrl->rr_cv);
+               mutex_exit(&rrl->rr_lock);
+               return;
+       }
+       DTRACE_PROBE(zfs__rrwfastpath__exitmiss);
+#endif
        ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) ||
            !refcount_is_zero(&rrl->rr_linked_rcount) ||
            rrl->rr_writer != NULL);
 
        if (rrl->rr_writer == NULL) {
-               if (rrn_find_and_remove(rrl)) {
-                       if (refcount_remove(&rrl->rr_linked_rcount, tag) == 0)
-                               cv_broadcast(&rrl->rr_cv);
-
-               } else {
-                       if (refcount_remove(&rrl->rr_anon_rcount, tag) == 0)
-                               cv_broadcast(&rrl->rr_cv);
-               }
+               int64_t count;
+               if (rrn_find_and_remove(rrl))
+                       count = refcount_remove(&rrl->rr_linked_rcount, tag);
+               else
+                       count = refcount_remove(&rrl->rr_anon_rcount, tag);
+               if (count == 0)
+                       cv_broadcast(&rrl->rr_cv);
        } else {
                ASSERT(rrl->rr_writer == curthread);
                ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) &&
index 6a95b39..d7ed23e 100644 (file)
@@ -1574,9 +1574,12 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
                /*
                 * If the config cache is stale, or we have uninitialized
                 * metaslabs (see spa_vdev_add()), then update the config.
+                *
+                * If spa_load_verbatim is true, trust the current
+                * in-core spa_config and update the disk labels.
                 */
                if (config_cache_txg != spa->spa_config_txg ||
-                   state == SPA_LOAD_IMPORT)
+                   state == SPA_LOAD_IMPORT || spa->spa_load_verbatim)
                        need_update = B_TRUE;
 
                for (int c = 0; c < rvd->vdev_children; c++)
@@ -2271,6 +2274,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 
        if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
                (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
+       spa_history_log_version(spa, LOG_POOL_CREATE);
 
        spa->spa_minref = refcount_count(&spa->spa_refcount);
 
@@ -2404,6 +2408,7 @@ spa_import_rootpool(char *devpath, char *devid)
 
        spa = spa_add(pname, NULL);
        spa->spa_is_root = B_TRUE;
+       spa->spa_load_verbatim = B_TRUE;
 
        /*
         * Build up a vdev tree based on the boot device's label config.
@@ -2459,6 +2464,7 @@ spa_import_rootpool(char *devpath, char *devid)
 
        VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
        error = 0;
+       spa_history_log_version(spa, LOG_POOL_IMPORT);
 out:
        spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
        vdev_free(rvd);
@@ -2491,6 +2497,8 @@ spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props)
            zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
        spa = spa_add(pool, altroot);
 
+       spa->spa_load_verbatim = B_TRUE;
+
        VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
 
        if (props != NULL)
@@ -2499,6 +2507,7 @@ spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props)
        spa_config_sync(spa, B_FALSE, B_TRUE);
 
        mutex_exit(&spa_namespace_lock);
+       spa_history_log_version(spa, LOG_POOL_IMPORT);
 
        return (0);
 }
@@ -2624,7 +2633,7 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
                /*
                 * Update the config cache to include the newly-imported pool.
                 */
-               spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, B_FALSE);
+               spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
        }
 
        /*
@@ -2634,6 +2643,7 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
        spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
 
        mutex_exit(&spa_namespace_lock);
+       spa_history_log_version(spa, LOG_POOL_IMPORT);
 
        return (0);
 }
@@ -2991,7 +3001,6 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
        vdev_t *rvd = spa->spa_root_vdev;
        vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
        vdev_ops_t *pvops;
-       dmu_tx_t *tx;
        char *oldvdpath, *newvdpath;
        int newvd_isspare;
        int error;
@@ -3147,17 +3156,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
 
        (void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
 
-       tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
-       if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
-               spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, tx,
-                   CRED(),  "%s vdev=%s %s vdev=%s",
-                   replacing && newvd_isspare ? "spare in" :
-                   replacing ? "replace" : "attach", newvdpath,
-                   replacing ? "for" : "to", oldvdpath);
-               dmu_tx_commit(tx);
-       } else {
-               dmu_tx_abort(tx);
-       }
+       spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, NULL,
+           CRED(),  "%s vdev=%s %s vdev=%s",
+           replacing && newvd_isspare ? "spare in" :
+           replacing ? "replace" : "attach", newvdpath,
+           replacing ? "for" : "to", oldvdpath);
 
        spa_strfree(oldvdpath);
        spa_strfree(newvdpath);
@@ -3747,19 +3750,11 @@ spa_async_thread(spa_t *spa)
                 * then log an internal history event.
                 */
                if (space_update) {
-                       dmu_tx_t *tx;
-
-                       tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
-                       if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
-                               spa_history_internal_log(LOG_POOL_VDEV_ONLINE,
-                                   spa, tx, CRED(),
-                                   "pool '%s' size: %llu(+%llu)",
-                                   spa_name(spa), spa_get_space(spa),
-                                   space_update);
-                               dmu_tx_commit(tx);
-                       } else {
-                               dmu_tx_abort(tx);
-                       }
+                       spa_history_internal_log(LOG_POOL_VDEV_ONLINE,
+                           spa, NULL, CRED(),
+                           "pool '%s' size: %llu(+%llu)",
+                           spa_name(spa), spa_get_space(spa),
+                           space_update);
                }
        }
 
index 7103e17..b2063bb 100644 (file)
@@ -209,7 +209,7 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent)
 
        ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
-       if (rootdir == NULL)
+       if (rootdir == NULL || !(spa_mode_global & FWRITE))
                return;
 
        /*
@@ -394,23 +394,12 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
 }
 
 /*
- * For a pool that's not currently a booting rootpool, update all disk labels,
- * generate a fresh config based on the current in-core state, and sync the
- * global config cache.
- */
-void
-spa_config_update(spa_t *spa, int what)
-{
-       spa_config_update_common(spa, what, FALSE);
-}
-
-/*
  * Update all disk labels, generate a fresh config based on the current
  * in-core state, and sync the global config cache (do not sync the config
  * cache if this is a booting rootpool).
  */
 void
-spa_config_update_common(spa_t *spa, int what, boolean_t isroot)
+spa_config_update(spa_t *spa, int what)
 {
        vdev_t *rvd = spa->spa_root_vdev;
        uint64_t txg;
@@ -447,9 +436,9 @@ spa_config_update_common(spa_t *spa, int what, boolean_t isroot)
        /*
         * Update the global config cache to reflect the new mosconfig.
         */
-       if (!isroot)
+       if (!spa->spa_is_root)
                spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL);
 
        if (what == SPA_CONFIG_UPDATE_POOL)
-               spa_config_update_common(spa, SPA_CONFIG_UPDATE_VDEVS, isroot);
+               spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS);
 }
index 97d97d8..b77ac42 100644 (file)
@@ -390,13 +390,12 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
        return (err);
 }
 
-void
-spa_history_internal_log(history_internal_events_t event, spa_t *spa,
-    dmu_tx_t *tx, cred_t *cr, const char *fmt, ...)
+static void
+log_internal(history_internal_events_t event, spa_t *spa,
+    dmu_tx_t *tx, cred_t *cr, const char *fmt, va_list adx)
 {
        history_arg_t *hap;
        char *str;
-       va_list adx;
 
        /*
         * If this is part of creating a pool, not everything is
@@ -408,9 +407,7 @@ spa_history_internal_log(history_internal_events_t event, spa_t *spa,
        hap = kmem_alloc(sizeof (history_arg_t), KM_SLEEP);
        str = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP);
 
-       va_start(adx, fmt);
        (void) vsnprintf(str, HIS_MAX_RECORD_LEN, fmt, adx);
-       va_end(adx);
 
        hap->ha_log_type = LOG_INTERNAL;
        hap->ha_history_str = str;
@@ -425,3 +422,48 @@ spa_history_internal_log(history_internal_events_t event, spa_t *spa,
        }
        /* spa_history_log_sync() will free hap and str */
 }
+
+void
+spa_history_internal_log(history_internal_events_t event, spa_t *spa,
+    dmu_tx_t *tx, cred_t *cr, const char *fmt, ...)
+{
+       dmu_tx_t *htx = tx;
+       va_list adx;
+
+       /* create a tx if we didn't get one */
+       if (tx == NULL) {
+               htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+               if (dmu_tx_assign(htx, TXG_WAIT) != 0) {
+                       dmu_tx_abort(htx);
+                       return;
+               }
+       }
+
+       va_start(adx, fmt);
+       log_internal(event, spa, htx, cr, fmt, adx);
+       va_end(adx);
+
+       /* if we didn't get a tx from the caller, commit the one we made */
+       if (tx == NULL)
+               dmu_tx_commit(htx);
+}
+
+void
+spa_history_log_version(spa_t *spa, history_internal_events_t event)
+{
+#ifdef _KERNEL
+       uint64_t current_vers = spa_version(spa);
+
+       if (current_vers >= SPA_VERSION_ZPOOL_HISTORY) {
+               spa_history_internal_log(event, spa, NULL, CRED(),
+                   "pool spa %llu; zfs spa %llu; zpl %d; uts %s %s %s %s",
+                   (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION,
+                   utsname.nodename, utsname.release, utsname.version,
+                   utsname.machine);
+       }
+       cmn_err(CE_CONT, "!%s version %llu pool %s using %llu",
+           event == LOG_POOL_IMPORT ? "imported" :
+           event == LOG_POOL_CREATE ? "created" : "accessed",
+           (u_longlong_t)current_vers, spa_name(spa), SPA_VERSION);
+#endif
+}
index aea3f56..8150ac9 100644 (file)
@@ -310,8 +310,12 @@ spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
 void
 spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
 {
+       int wlocks_held = 0;
+
        for (int i = 0; i < SCL_LOCKS; i++) {
                spa_config_lock_t *scl = &spa->spa_config_lock[i];
+               if (scl->scl_writer == curthread)
+                       wlocks_held |= (1 << i);
                if (!(locks & (1 << i)))
                        continue;
                mutex_enter(&scl->scl_lock);
@@ -331,6 +335,7 @@ spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
                (void) refcount_add(&scl->scl_count, tag);
                mutex_exit(&scl->scl_lock);
        }
+       ASSERT(wlocks_held <= locks);
 }
 
 void
index 3fa677e..bb5024f 100644 (file)
@@ -405,22 +405,26 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
                if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
                    &nparity) == 0) {
                        /*
-                        * Currently, we can only support 2 parity devices.
+                        * Currently, we can only support 3 parity devices.
                         */
-                       if (nparity == 0 || nparity > 2)
+                       if (nparity == 0 || nparity > 3)
                                return (EINVAL);
                        /*
-                        * Older versions can only support 1 parity device.
+                        * Previous versions could only support 1 or 2 parity
+                        * devices.
                         */
-                       if (nparity == 2 &&
-                           spa_version(spa) < SPA_VERSION_RAID6)
+                       if (nparity > 1 &&
+                           spa_version(spa) < SPA_VERSION_RAIDZ2)
+                               return (ENOTSUP);
+                       if (nparity > 2 &&
+                           spa_version(spa) < SPA_VERSION_RAIDZ3)
                                return (ENOTSUP);
                } else {
                        /*
                         * We require the parity to be specified for SPAs that
                         * support multiple parity levels.
                         */
-                       if (spa_version(spa) >= SPA_VERSION_RAID6)
+                       if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
                                return (EINVAL);
                        /*
                         * Otherwise, we default to 1 parity device for RAID-Z.
@@ -993,6 +997,32 @@ vdev_probe(vdev_t *vd, zio_t *zio)
        return (NULL);
 }
 
+static void
+vdev_open_child(void *arg)
+{
+       vdev_t *vd = arg;
+
+       vd->vdev_open_thread = curthread;
+       vd->vdev_open_error = vdev_open(vd);
+       vd->vdev_open_thread = NULL;
+}
+
+void
+vdev_open_children(vdev_t *vd)
+{
+       taskq_t *tq;
+       int children = vd->vdev_children;
+
+       tq = taskq_create("vdev_open", children, minclsyspri,
+           children, children, TASKQ_PREPOPULATE);
+
+       for (int c = 0; c < children; c++)
+               VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
+                   TQ_SLEEP) != NULL);
+
+       taskq_destroy(tq);
+}
+
 /*
  * Prepare a virtual device for access.
  */
@@ -1005,8 +1035,8 @@ vdev_open(vdev_t *vd)
        uint64_t asize, psize;
        uint64_t ashift = 0;
 
-       ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
-
+       ASSERT(vd->vdev_open_thread == curthread ||
+           spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
        ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
            vd->vdev_state == VDEV_STATE_CANT_OPEN ||
            vd->vdev_state == VDEV_STATE_OFFLINE);
@@ -1217,7 +1247,12 @@ vdev_validate(vdev_t *vd)
 
                nvlist_free(label);
 
-               if (spa->spa_load_state == SPA_LOAD_OPEN &&
+               /*
+                * If spa->spa_load_verbatim is true, no need to check the
+                * state of the pool.
+                */
+               if (!spa->spa_load_verbatim &&
+                   spa->spa_load_state == SPA_LOAD_OPEN &&
                    state != POOL_STATE_ACTIVE)
                        return (EBADF);
 
index 48d5fc2..06cb720 100644 (file)
@@ -246,8 +246,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
                 * into a crufty old storage pool.
                 */
                ASSERT(vd->vdev_nparity == 1 ||
-                   (vd->vdev_nparity == 2 &&
-                   spa_version(spa) >= SPA_VERSION_RAID6));
+                   (vd->vdev_nparity <= 2 &&
+                   spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
+                   (vd->vdev_nparity <= 3 &&
+                   spa_version(spa) >= SPA_VERSION_RAIDZ3));
 
                /*
                 * Note that we'll add the nparity tag even on storage pools
@@ -642,8 +644,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
        /*
         * Initialize uberblock template.
         */
-       ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
-       bzero(ub, VDEV_UBERBLOCK_SIZE(vd));
+       ub = zio_buf_alloc(VDEV_UBERBLOCK_RING);
+       bzero(ub, VDEV_UBERBLOCK_RING);
        *ub = spa->spa_uberblock;
        ub->ub_txg = 0;
 
@@ -672,11 +674,9 @@ retry:
                    offsetof(vdev_label_t, vl_pad2),
                    VDEV_PAD_SIZE, NULL, NULL, flags);
 
-               for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
-                       vdev_label_write(zio, vd, l, ub,
-                           VDEV_UBERBLOCK_OFFSET(vd, n),
-                           VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags);
-               }
+               vdev_label_write(zio, vd, l, ub,
+                   offsetof(vdev_label_t, vl_uberblock),
+                   VDEV_UBERBLOCK_RING, NULL, NULL, flags);
        }
 
        error = zio_wait(zio);
@@ -688,7 +688,7 @@ retry:
 
        nvlist_free(label);
        zio_buf_free(pad2, VDEV_PAD_SIZE);
-       zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd));
+       zio_buf_free(ub, VDEV_UBERBLOCK_RING);
        zio_buf_free(vp, sizeof (vdev_phys_t));
 
        /*
index fff7e08..836386d 100644 (file)
@@ -124,21 +124,21 @@ vdev_mirror_map_alloc(zio_t *zio)
 static int
 vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
 {
-       vdev_t *cvd;
-       uint64_t c;
        int numerrors = 0;
-       int ret, lasterror = 0;
+       int lasterror = 0;
 
        if (vd->vdev_children == 0) {
                vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
                return (EINVAL);
        }
 
-       for (c = 0; c < vd->vdev_children; c++) {
-               cvd = vd->vdev_child[c];
+       vdev_open_children(vd);
 
-               if ((ret = vdev_open(cvd)) != 0) {
-                       lasterror = ret;
+       for (int c = 0; c < vd->vdev_children; c++) {
+               vdev_t *cvd = vd->vdev_child[c];
+
+               if (cvd->vdev_open_error) {
+                       lasterror = cvd->vdev_open_error;
                        numerrors++;
                        continue;
                }
@@ -158,9 +158,7 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
 static void
 vdev_mirror_close(vdev_t *vd)
 {
-       uint64_t c;
-
-       for (c = 0; c < vd->vdev_children; c++)
+       for (int c = 0; c < vd->vdev_children; c++)
                vdev_close(vd->vdev_child[c]);
 }
 
index 5e57a15..9867d09 100644 (file)
@@ -24,7 +24,7 @@
  */
 
 #include <sys/zfs_context.h>
-#include <sys/spa.h>
+#include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 #include <sys/avl.h>
@@ -48,11 +48,14 @@ int zfs_vdev_time_shift = 6;
 int zfs_vdev_ramp_rate = 2;
 
 /*
- * To reduce IOPs, we aggregate small adjacent i/os into one large i/o.
- * For read i/os, we also aggregate across small adjacency gaps.
+ * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
+ * For read I/Os, we also aggregate across small adjacency gaps; for writes
+ * we include spans of optional I/Os to aid aggregation at the disk even when
+ * they aren't able to help us aggregate at this level.
  */
 int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
 int zfs_vdev_read_gap_limit = 32 << 10;
+int zfs_vdev_write_gap_limit = 4 << 10;
 
 /*
  * Virtual device vector for disk I/O scheduling.
@@ -172,12 +175,14 @@ vdev_queue_agg_io_done(zio_t *aio)
 static zio_t *
 vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
 {
-       zio_t *fio, *lio, *aio, *dio, *nio;
+       zio_t *fio, *lio, *aio, *dio, *nio, *mio;
        avl_tree_t *t;
        int flags;
        uint64_t maxspan = zfs_vdev_aggregation_limit;
        uint64_t maxgap;
+       int stretch;
 
+again:
        ASSERT(MUTEX_HELD(&vq->vq_lock));
 
        if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
@@ -192,21 +197,88 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
 
        if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
                /*
-                * We can aggregate I/Os that are adjacent and of the
-                * same flavor, as expressed by the AGG_INHERIT flags.
-                * The latter is necessary so that certain attributes
-                * of the I/O, such as whether it's a normal I/O or a
-                * scrub/resilver, can be preserved in the aggregate.
+                * We can aggregate I/Os that are sufficiently adjacent and of
+                * the same flavor, as expressed by the AGG_INHERIT flags.
+                * The latter requirement is necessary so that certain
+                * attributes of the I/O, such as whether it's a normal I/O
+                * or a scrub/resilver, can be preserved in the aggregate.
+                * We can include optional I/Os, but don't allow them
+                * to begin a range as they add no benefit in that situation.
+                */
+
+               /*
+                * We keep track of the last non-optional I/O.
+                */
+               mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio;
+
+               /*
+                * Walk backwards through sufficiently contiguous I/Os
+                * recording the last non-optional I/O.
                 */
                while ((dio = AVL_PREV(t, fio)) != NULL &&
                    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
-                   IO_SPAN(dio, lio) <= maxspan && IO_GAP(dio, fio) <= maxgap)
+                   IO_SPAN(dio, lio) <= maxspan &&
+                   IO_GAP(dio, fio) <= maxgap) {
                        fio = dio;
+                       if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL))
+                               mio = fio;
+               }
 
+               /*
+                * Skip any initial optional I/Os.
+                */
+               while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) {
+                       fio = AVL_NEXT(t, fio);
+                       ASSERT(fio != NULL);
+               }
+
+               /*
+                * Walk forward through sufficiently contiguous I/Os.
+                */
                while ((dio = AVL_NEXT(t, lio)) != NULL &&
                    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
-                   IO_SPAN(fio, dio) <= maxspan && IO_GAP(lio, dio) <= maxgap)
+                   IO_SPAN(fio, dio) <= maxspan &&
+                   IO_GAP(lio, dio) <= maxgap) {
                        lio = dio;
+                       if (!(lio->io_flags & ZIO_FLAG_OPTIONAL))
+                               mio = lio;
+               }
+
+               /*
+                * Now that we've established the range of the I/O aggregation
+                * we must decide what to do with trailing optional I/Os.
+                * For reads, there's nothing to do. While we are unable to
+                * aggregate further, it's possible that a trailing optional
+                * I/O would allow the underlying device to aggregate with
+                * subsequent I/Os. We must therefore determine if the next
+                * non-optional I/O is close enough to make aggregation
+                * worthwhile.
+                */
+               stretch = B_FALSE;
+               if (t != &vq->vq_read_tree && mio != NULL) {
+                       nio = lio;
+                       while ((dio = AVL_NEXT(t, nio)) != NULL &&
+                           IO_GAP(nio, dio) == 0 &&
+                           IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) {
+                               nio = dio;
+                               if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
+                                       stretch = B_TRUE;
+                                       break;
+                               }
+                       }
+               }
+
+               if (stretch) {
+                       /* This may be a no-op. */
+                       VERIFY((dio = AVL_NEXT(t, lio)) != NULL);
+                       dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
+               } else {
+                       while (lio != mio && lio != fio) {
+                               ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL);
+                               lio = AVL_PREV(t, lio);
+                               ASSERT(lio != NULL);
+                       }
+               }
        }
 
        if (fio != lio) {
@@ -225,10 +297,15 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
                        ASSERT(dio->io_type == aio->io_type);
                        ASSERT(dio->io_vdev_tree == t);
 
-                       if (dio->io_type == ZIO_TYPE_WRITE)
+                       if (dio->io_flags & ZIO_FLAG_NODATA) {
+                               ASSERT(dio->io_type == ZIO_TYPE_WRITE);
+                               bzero((char *)aio->io_data + (dio->io_offset -
+                                   aio->io_offset), dio->io_size);
+                       } else if (dio->io_type == ZIO_TYPE_WRITE) {
                                bcopy(dio->io_data, (char *)aio->io_data +
                                    (dio->io_offset - aio->io_offset),
                                    dio->io_size);
+                       }
 
                        zio_add_child(dio, aio);
                        vdev_queue_io_remove(vq, dio);
@@ -244,6 +321,20 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
        ASSERT(fio->io_vdev_tree == t);
        vdev_queue_io_remove(vq, fio);
 
+       /*
+        * If the I/O is or was optional and therefore has no data, we need to
+        * simply discard it. We need to drop the vdev queue's lock to avoid a
+        * deadlock that we could encounter since this I/O will complete
+        * immediately.
+        */
+       if (fio->io_flags & ZIO_FLAG_NODATA) {
+               mutex_exit(&vq->vq_lock);
+               zio_vdev_io_bypass(fio);
+               zio_execute(fio);
+               mutex_enter(&vq->vq_lock);
+               goto again;
+       }
+
        avl_add(&vq->vq_pending_tree, fio);
 
        return (fio);
index 92753d8..b307417 100644 (file)
 /*
  * Virtual device vector for RAID-Z.
  *
- * This vdev supports both single and double parity. For single parity, we
- * use a simple XOR of all the data columns. For double parity, we use both
- * the simple XOR as well as a technique described in "The mathematics of
- * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
- * over the integers expressable in a single byte. Briefly, the operations on
- * the field are defined as follows:
+ * This vdev supports single, double, and triple parity. For single parity,
+ * we use a simple XOR of all the data columns. For double or triple parity,
+ * we use a special case of Reed-Solomon coding. This extends the
+ * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
+ * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
+ * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
+ * former is also based. The latter is designed to provide higher performance
+ * for writes.
+ *
+ * Note that the Plank paper claimed to support arbitrary N+M, but was then
+ * amended six years later identifying a critical flaw that invalidates its
+ * claims. Nevertheless, the technique can be adapted to work for up to
+ * triple parity. For additional parity, the amendment "Note: Correction to
+ * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
+ * is viable, but the additional complexity means that write performance will
+ * suffer.
+ *
+ * All of the methods above operate on a Galois field, defined over the
+ * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
+ * can be expressed with a single byte. Briefly, the operations on the
+ * field are defined as follows:
  *
  *   o addition (+) is represented by a bitwise XOR
  *   o subtraction (-) is therefore identical to addition: A + B = A - B
  *     (A * 2)_0 = A_7
  *
  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
+ * As an aside, this multiplication is derived from the error correcting
+ * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
  *
  * Observe that any number in the field (except for 0) can be expressed as a
  * power of 2 -- a generator for the field. We store a table of the powers of
  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
- * than field addition). The inverse of a field element A (A^-1) is A^254.
+ * than field addition). The inverse of a field element A (A^-1) is therefore
+ * A ^ (255 - 1) = A^254.
  *
- * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
- * can be expressed by field operations:
+ * The up-to-three parity columns, P, Q, R over several data columns,
+ * D_0, ... D_n-1, can be expressed by field operations:
  *
  *     P = D_0 + D_1 + ... + D_n-2 + D_n-1
  *     Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
  *       = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
+ *     R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
+ *       = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
  *
- * See the reconstruction code below for how P and Q can used individually or
- * in concert to recover missing data columns.
+ * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
+ * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
+ * independent coefficients. (There are no additional coefficients that have
+ * this property which is why the uncorrected Plank method breaks down.)
+ *
+ * See the reconstruction code below for how P, Q and R can be used individually
+ * or in concert to recover missing data columns.
  */
 
 typedef struct raidz_col {
@@ -84,21 +109,49 @@ typedef struct raidz_col {
 } raidz_col_t;
 
 typedef struct raidz_map {
-       uint64_t rm_cols;               /* Column count */
+       uint64_t rm_cols;               /* Regular column count */
+       uint64_t rm_scols;              /* Count including skipped columns */
        uint64_t rm_bigcols;            /* Number of oversized columns */
        uint64_t rm_asize;              /* Actual total I/O size */
        uint64_t rm_missingdata;        /* Count of missing data devices */
        uint64_t rm_missingparity;      /* Count of missing parity devices */
        uint64_t rm_firstdatacol;       /* First data column/parity count */
+       uint64_t rm_skipped;            /* Skipped sectors for padding */
        raidz_col_t rm_col[1];          /* Flexible array of I/O columns */
 } raidz_map_t;
 
 #define        VDEV_RAIDZ_P            0
 #define        VDEV_RAIDZ_Q            1
+#define        VDEV_RAIDZ_R            2
+#define        VDEV_RAIDZ_MAXPARITY    3
+
+#define        VDEV_RAIDZ_MUL_2(x)     (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
+#define        VDEV_RAIDZ_MUL_4(x)     (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
+
+/*
+ * We provide a mechanism to perform the field multiplication operation on a
+ * 64-bit value all at once rather than a byte at a time. This works by
+ * creating a mask from the top bit in each byte and using that to
+ * conditionally apply the XOR of 0x1d.
+ */
+#define        VDEV_RAIDZ_64MUL_2(x, mask) \
+{ \
+       (mask) = (x) & 0x8080808080808080ULL; \
+       (mask) = ((mask) << 1) - ((mask) >> 7); \
+       (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
+           ((mask) & 0x1d1d1d1d1d1d1d1d); \
+}
 
-#define        VDEV_RAIDZ_MAXPARITY    2
+#define        VDEV_RAIDZ_64MUL_4(x, mask) \
+{ \
+       VDEV_RAIDZ_64MUL_2((x), mask); \
+       VDEV_RAIDZ_64MUL_2((x), mask); \
+}
 
-#define        VDEV_RAIDZ_MUL_2(a)     (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
+/*
+ * Force reconstruction to use the general purpose method.
+ */
+int vdev_raidz_default_to_general;
 
 /*
  * These two tables represent powers and logs of 2 in the Galois field defined
@@ -201,7 +254,7 @@ vdev_raidz_map_free(zio_t *zio)
        for (c = 0; c < rm->rm_firstdatacol; c++)
                zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
 
-       kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
+       kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
 }
 
 static raidz_map_t *
@@ -213,24 +266,35 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
        uint64_t s = zio->io_size >> unit_shift;
        uint64_t f = b % dcols;
        uint64_t o = (b / dcols) << unit_shift;
-       uint64_t q, r, c, bc, col, acols, coff, devidx;
+       uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
 
        q = s / (dcols - nparity);
        r = s - q * (dcols - nparity);
        bc = (r == 0 ? 0 : r + nparity);
+       tot = s + nparity * (q + (r == 0 ? 0 : 1));
+
+       if (q == 0) {
+               acols = bc;
+               scols = MIN(dcols, roundup(bc, nparity + 1));
+       } else {
+               acols = dcols;
+               scols = dcols;
+       }
 
-       acols = (q == 0 ? bc : dcols);
+       ASSERT3U(acols, <=, scols);
 
-       rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
+       rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
 
        rm->rm_cols = acols;
+       rm->rm_scols = scols;
        rm->rm_bigcols = bc;
-       rm->rm_asize = 0;
        rm->rm_missingdata = 0;
        rm->rm_missingparity = 0;
        rm->rm_firstdatacol = nparity;
 
-       for (c = 0; c < acols; c++) {
+       asize = 0;
+
+       for (c = 0; c < scols; c++) {
                col = f + c;
                coff = o;
                if (col >= dcols) {
@@ -239,15 +303,26 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
                }
                rm->rm_col[c].rc_devidx = col;
                rm->rm_col[c].rc_offset = coff;
-               rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
                rm->rm_col[c].rc_data = NULL;
                rm->rm_col[c].rc_error = 0;
                rm->rm_col[c].rc_tried = 0;
                rm->rm_col[c].rc_skipped = 0;
-               rm->rm_asize += rm->rm_col[c].rc_size;
+
+               if (c >= acols)
+                       rm->rm_col[c].rc_size = 0;
+               else if (c < bc)
+                       rm->rm_col[c].rc_size = (q + 1) << unit_shift;
+               else
+                       rm->rm_col[c].rc_size = q << unit_shift;
+
+               asize += rm->rm_col[c].rc_size;
        }
 
-       rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
+       ASSERT3U(asize, ==, tot << unit_shift);
+       rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
+       rm->rm_skipped = roundup(tot, nparity + 1) - tot;
+       ASSERT3U(rm->rm_asize - asize, ==, rm->rm_skipped << unit_shift);
+       ASSERT3U(rm->rm_skipped, <=, nparity);
 
        for (c = 0; c < rm->rm_firstdatacol; c++)
                rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
@@ -305,12 +380,12 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm)
 
                if (c == rm->rm_firstdatacol) {
                        ASSERT(ccount == pcount);
-                       for (i = 0; i < ccount; i++, p++, src++) {
+                       for (i = 0; i < ccount; i++, src++, p++) {
                                *p = *src;
                        }
                } else {
                        ASSERT(ccount <= pcount);
-                       for (i = 0; i < ccount; i++, p++, src++) {
+                       for (i = 0; i < ccount; i++, src++, p++) {
                                *p ^= *src;
                        }
                }
@@ -320,10 +395,10 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm)
 static void
 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
 {
-       uint64_t *q, *p, *src, pcount, ccount, mask, i;
+       uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
        int c;
 
-       pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+       pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
        ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
            rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 
@@ -331,55 +406,138 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm)
                src = rm->rm_col[c].rc_data;
                p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
                q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
-               ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+
+               ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
 
                if (c == rm->rm_firstdatacol) {
-                       ASSERT(ccount == pcount || ccount == 0);
-                       for (i = 0; i < ccount; i++, p++, q++, src++) {
-                               *q = *src;
+                       ASSERT(ccnt == pcnt || ccnt == 0);
+                       for (i = 0; i < ccnt; i++, src++, p++, q++) {
                                *p = *src;
+                               *q = *src;
                        }
-                       for (; i < pcount; i++, p++, q++, src++) {
-                               *q = 0;
+                       for (; i < pcnt; i++, src++, p++, q++) {
                                *p = 0;
+                               *q = 0;
                        }
                } else {
-                       ASSERT(ccount <= pcount);
+                       ASSERT(ccnt <= pcnt);
 
                        /*
-                        * Rather than multiplying each byte individually (as
-                        * described above), we are able to handle 8 at once
-                        * by generating a mask based on the high bit in each
-                        * byte and using that to conditionally XOR in 0x1d.
+                        * Apply the algorithm described above by multiplying
+                        * the previous result and adding in the new value.
                         */
-                       for (i = 0; i < ccount; i++, p++, q++, src++) {
-                               mask = *q & 0x8080808080808080ULL;
-                               mask = (mask << 1) - (mask >> 7);
-                               *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
-                                   (mask & 0x1d1d1d1d1d1d1d1dULL);
+                       for (i = 0; i < ccnt; i++, src++, p++, q++) {
+                               *p ^= *src;
+
+                               VDEV_RAIDZ_64MUL_2(*q, mask);
                                *q ^= *src;
+                       }
+
+                       /*
+                        * Treat short columns as though they are full of 0s.
+                        * Note that there's therefore nothing needed for P.
+                        */
+                       for (; i < pcnt; i++, q++) {
+                               VDEV_RAIDZ_64MUL_2(*q, mask);
+                       }
+               }
+       }
+}
+
+static void
+vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
+{
+       uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
+       int c;
+
+       pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+       ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
+           rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+       ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
+           rm->rm_col[VDEV_RAIDZ_R].rc_size);
+
+       for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+               src = rm->rm_col[c].rc_data;
+               p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+               q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+               r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
+
+               ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
+
+               if (c == rm->rm_firstdatacol) {
+                       ASSERT(ccnt == pcnt || ccnt == 0);
+                       for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
+                               *p = *src;
+                               *q = *src;
+                               *r = *src;
+                       }
+                       for (; i < pcnt; i++, src++, p++, q++, r++) {
+                               *p = 0;
+                               *q = 0;
+                               *r = 0;
+                       }
+               } else {
+                       ASSERT(ccnt <= pcnt);
+
+                       /*
+                        * Apply the algorithm described above by multiplying
+                        * the previous result and adding in the new value.
+                        */
+                       for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
                                *p ^= *src;
+
+                               VDEV_RAIDZ_64MUL_2(*q, mask);
+                               *q ^= *src;
+
+                               VDEV_RAIDZ_64MUL_4(*r, mask);
+                               *r ^= *src;
                        }
 
                        /*
                         * Treat short columns as though they are full of 0s.
+                        * Note that there's therefore nothing needed for P.
                         */
-                       for (; i < pcount; i++, q++) {
-                               mask = *q & 0x8080808080808080ULL;
-                               mask = (mask << 1) - (mask >> 7);
-                               *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
-                                   (mask & 0x1d1d1d1d1d1d1d1dULL);
+                       for (; i < pcnt; i++, q++, r++) {
+                               VDEV_RAIDZ_64MUL_2(*q, mask);
+                               VDEV_RAIDZ_64MUL_4(*r, mask);
                        }
                }
        }
 }
 
+/*
+ * Generate RAID parity in the first virtual columns according to the number of
+ * parity columns available.
+ */
 static void
-vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
+vdev_raidz_generate_parity(raidz_map_t *rm)
+{
+       switch (rm->rm_firstdatacol) {
+       case 1:
+               vdev_raidz_generate_parity_p(rm);
+               break;
+       case 2:
+               vdev_raidz_generate_parity_pq(rm);
+               break;
+       case 3:
+               vdev_raidz_generate_parity_pqr(rm);
+               break;
+       default:
+               cmn_err(CE_PANIC, "invalid RAID-Z configuration");
+       }
+}
+
+static int
+vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
 {
        uint64_t *dst, *src, xcount, ccount, count, i;
+       int x = tgts[0];
        int c;
 
+       ASSERT(ntgts == 1);
+       ASSERT(x >= rm->rm_firstdatacol);
+       ASSERT(x < rm->rm_cols);
+
        xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
        ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
        ASSERT(xcount > 0);
@@ -404,15 +562,20 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
                        *dst ^= *src;
                }
        }
+
+       return (1 << VDEV_RAIDZ_P);
 }
 
-static void
-vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
+static int
+vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
 {
        uint64_t *dst, *src, xcount, ccount, count, mask, i;
        uint8_t *b;
+       int x = tgts[0];
        int c, j, exp;
 
+       ASSERT(ntgts == 1);
+
        xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
        ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
 
@@ -436,23 +599,13 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
                        }
 
                } else {
-                       /*
-                        * For an explanation of this, see the comment in
-                        * vdev_raidz_generate_parity_pq() above.
-                        */
                        for (i = 0; i < count; i++, dst++, src++) {
-                               mask = *dst & 0x8080808080808080ULL;
-                               mask = (mask << 1) - (mask >> 7);
-                               *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
-                                   (mask & 0x1d1d1d1d1d1d1d1dULL);
+                               VDEV_RAIDZ_64MUL_2(*dst, mask);
                                *dst ^= *src;
                        }
 
                        for (; i < xcount; i++, dst++) {
-                               mask = *dst & 0x8080808080808080ULL;
-                               mask = (mask << 1) - (mask >> 7);
-                               *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
-                                   (mask & 0x1d1d1d1d1d1d1d1dULL);
+                               VDEV_RAIDZ_64MUL_2(*dst, mask);
                        }
                }
        }
@@ -467,15 +620,20 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
                        *b = vdev_raidz_exp2(*b, exp);
                }
        }
+
+       return (1 << VDEV_RAIDZ_Q);
 }
 
-static void
-vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
+static int
+vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
 {
        uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
        void *pdata, *qdata;
        uint64_t xsize, ysize, i;
+       int x = tgts[0];
+       int y = tgts[1];
 
+       ASSERT(ntgts == 2);
        ASSERT(x < y);
        ASSERT(x >= rm->rm_firstdatacol);
        ASSERT(y < rm->rm_cols);
@@ -553,15 +711,554 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
         */
        rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
        rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
+
+       return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
+}
+
+/* BEGIN CSTYLED */
+/*
+ * In the general case of reconstruction, we must solve the system of linear
+ * equations defined by the coefficients used to generate parity as well as
+ * the contents of the data and parity disks. This can be expressed with
+ * vectors for the original data (D) and the actual data (d) and parity (p)
+ * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
+ *
+ *            __   __                     __     __
+ *            |     |         __     __   |  p_0  |
+ *            |  V  |         |  D_0  |   | p_m-1 |
+ *            |     |    x    |   :   | = |  d_0  |
+ *            |  I  |         | D_n-1 |   |   :   |
+ *            |     |         ~~     ~~   | d_n-1 |
+ *            ~~   ~~                     ~~     ~~
+ *
+ * I is simply a square identity matrix of size n, and V is a Vandermonde
+ * matrix defined by the coefficients we chose for the various parity columns
+ * (1, 2, 4). Note that these values were chosen for simplicity, speedy
+ * computation, and linear separability.
+ *
+ *      __               __               __     __
+ *      |   1   ..  1 1 1 |               |  p_0  |
+ *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
+ *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
+ *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
+ *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
+ *      |   :       : : : |   |   :   |   |  d_2  |
+ *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
+ *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
+ *      |   0   ..  0 0 1 |               | d_n-1 |
+ *      ~~               ~~               ~~     ~~
+ *
+ * Note that I, V, d, and p are known. To compute D, we must invert the
+ * matrix and use the known data and parity values to reconstruct the unknown
+ * data values. We begin by removing the rows in V|I and d|p that correspond
+ * to failed or missing columns; we then make V|I square (n x n) and d|p
+ * sized n by removing rows corresponding to unused parity from the bottom up
+ * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
+ * using Gauss-Jordan elimination. In the example below we use m=3 parity
+ * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
+ *           __                               __
+ *           |  1   1   1   1   1   1   1   1  |
+ *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
+ *           |  19 205 116  29  64  16  4   1  |      / /
+ *           |  1   0   0   0   0   0   0   0  |     / /
+ *           |  0   1   0   0   0   0   0   0  | <--' /
+ *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
+ *           |  0   0   0   1   0   0   0   0  |
+ *           |  0   0   0   0   1   0   0   0  |
+ *           |  0   0   0   0   0   1   0   0  |
+ *           |  0   0   0   0   0   0   1   0  |
+ *           |  0   0   0   0   0   0   0   1  |
+ *           ~~                               ~~
+ *           __                               __
+ *           |  1   1   1   1   1   1   1   1  |
+ *           | 128  64  32  16  8   4   2   1  | <-- dropped (p_1)
+ *           |  19 205 116  29  64  16  4   1  |
+ *           |  1   0   0   0   0   0   0   0  |
+ *           |  0   1   0   0   0   0   0   0  | <-- dropped (d_1)
+ *  (V|I)' = |  0   0   1   0   0   0   0   0  | <-- dropped (d_2)
+ *           |  0   0   0   1   0   0   0   0  |
+ *           |  0   0   0   0   1   0   0   0  |
+ *           |  0   0   0   0   0   1   0   0  |
+ *           |  0   0   0   0   0   0   1   0  |
+ *           |  0   0   0   0   0   0   0   1  |
+ *           ~~                               ~~
+ *
+ * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
+ * have carefully chosen the seed values 1, 2, and 4 to ensure that this
+ * matrix is not singular.
+ * __                                                                 __
+ * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
+ * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ * __                                                                 __
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
+ * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ * __                                                                 __
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
+ * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ * __                                                                 __
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
+ * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ * __                                                                 __
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
+ * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ * __                                                                 __
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
+ * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ *                   __                               __
+ *                   |  0   0   1   0   0   0   0   0  |
+ *                   | 167 100  5   41 159 169 217 208 |
+ *                   | 166 100  4   40 158 168 216 209 |
+ *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
+ *                   |  0   0   0   0   1   0   0   0  |
+ *                   |  0   0   0   0   0   1   0   0  |
+ *                   |  0   0   0   0   0   0   1   0  |
+ *                   |  0   0   0   0   0   0   0   1  |
+ *                   ~~                               ~~
+ *
+ * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
+ * of the missing data.
+ *
+ * As is apparent from the example above, the only non-trivial rows in the
+ * inverse matrix correspond to the data disks that we're trying to
+ * reconstruct. Indeed, those are the only rows we need as the others would
+ * only be useful for reconstructing data known or assumed to be valid. For
+ * that reason, we only build the coefficients in the rows that correspond to
+ * targeted columns.
+ */
+/* END CSTYLED */
+
+static void
+vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
+    uint8_t **rows)
+{
+       int i, j;
+       int pow;
+
+       ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
+
+       /*
+        * Fill in the missing rows of interest.
+        */
+       for (i = 0; i < nmap; i++) {
+               ASSERT3S(0, <=, map[i]);
+               ASSERT3S(map[i], <=, 2);
+
+               pow = map[i] * n;
+               if (pow > 255)
+                       pow -= 255;
+               ASSERT(pow <= 255);
+
+               for (j = 0; j < n; j++) {
+                       pow -= map[i];
+                       if (pow < 0)
+                               pow += 255;
+                       rows[i][j] = vdev_raidz_pow2[pow];
+               }
+       }
+}
+
+static void
+vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
+    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
+{
+       int i, j, ii, jj;
+       uint8_t log;
+
+       /*
+        * Assert that the first nmissing entries from the array of used
+        * columns correspond to parity columns and that subsequent entries
+        * correspond to data columns.
+        */
+       for (i = 0; i < nmissing; i++) {
+               ASSERT3S(used[i], <, rm->rm_firstdatacol);
+       }
+       for (; i < n; i++) {
+               ASSERT3S(used[i], >=, rm->rm_firstdatacol);
+       }
+
+       /*
+        * First initialize the storage where we'll compute the inverse rows.
+        */
+       for (i = 0; i < nmissing; i++) {
+               for (j = 0; j < n; j++) {
+                       invrows[i][j] = (i == j) ? 1 : 0;
+               }
+       }
+
+       /*
+        * Subtract all trivial rows from the rows of consequence.
+        */
+       for (i = 0; i < nmissing; i++) {
+               for (j = nmissing; j < n; j++) {
+                       ASSERT3U(used[j], >=, rm->rm_firstdatacol);
+                       jj = used[j] - rm->rm_firstdatacol;
+                       ASSERT3S(jj, <, n);
+                       invrows[i][j] = rows[i][jj];
+                       rows[i][jj] = 0;
+               }
+       }
+
+       /*
+        * For each of the rows of interest, we must normalize it and subtract
+        * a multiple of it from the other rows.
+        */
+       for (i = 0; i < nmissing; i++) {
+               for (j = 0; j < missing[i]; j++) {
+                       ASSERT3U(rows[i][j], ==, 0);
+               }
+               ASSERT3U(rows[i][missing[i]], !=, 0);
+
+               /*
+                * Compute the inverse of the first element and multiply each
+                * element in the row by that value.
+                */
+               log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
+
+               for (j = 0; j < n; j++) {
+                       rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
+                       invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
+               }
+
+               for (ii = 0; ii < nmissing; ii++) {
+                       if (i == ii)
+                               continue;
+
+                       ASSERT3U(rows[ii][missing[i]], !=, 0);
+
+                       log = vdev_raidz_log2[rows[ii][missing[i]]];
+
+                       for (j = 0; j < n; j++) {
+                               rows[ii][j] ^=
+                                   vdev_raidz_exp2(rows[i][j], log);
+                               invrows[ii][j] ^=
+                                   vdev_raidz_exp2(invrows[i][j], log);
+                       }
+               }
+       }
+
+       /*
+        * Verify that the data that is left in the rows is properly part of
+        * an identity matrix.
+        */
+       for (i = 0; i < nmissing; i++) {
+               for (j = 0; j < n; j++) {
+                       if (j == missing[i]) {
+                               ASSERT3U(rows[i][j], ==, 1);
+                       } else {
+                               ASSERT3U(rows[i][j], ==, 0);
+                       }
+               }
+       }
 }
 
+static void
+vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
+    int *missing, uint8_t **invrows, const uint8_t *used)
+{
+       int i, j, x, cc, c;
+       uint8_t *src;
+       uint64_t ccount;
+       uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
+       uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
+       uint8_t log, val;
+       int ll;
+       uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
+       uint8_t *p, *pp;
+       size_t psize;
+
+       psize = sizeof (invlog[0][0]) * n * nmissing;
+       p = kmem_alloc(psize, KM_SLEEP);
+
+       for (pp = p, i = 0; i < nmissing; i++) {
+               invlog[i] = pp;
+               pp += n;
+       }
+
+       for (i = 0; i < nmissing; i++) {
+               for (j = 0; j < n; j++) {
+                       ASSERT3U(invrows[i][j], !=, 0);
+                       invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
+               }
+       }
+
+       for (i = 0; i < n; i++) {
+               c = used[i];
+               ASSERT3U(c, <, rm->rm_cols);
+
+               src = rm->rm_col[c].rc_data;
+               ccount = rm->rm_col[c].rc_size;
+               for (j = 0; j < nmissing; j++) {
+                       cc = missing[j] + rm->rm_firstdatacol;
+                       ASSERT3U(cc, >=, rm->rm_firstdatacol);
+                       ASSERT3U(cc, <, rm->rm_cols);
+                       ASSERT3U(cc, !=, c);
+
+                       dst[j] = rm->rm_col[cc].rc_data;
+                       dcount[j] = rm->rm_col[cc].rc_size;
+               }
+
+               ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
+
+               for (x = 0; x < ccount; x++, src++) {
+                       if (*src != 0)
+                               log = vdev_raidz_log2[*src];
+
+                       for (cc = 0; cc < nmissing; cc++) {
+                               if (x >= dcount[cc])
+                                       continue;
+
+                               if (*src == 0) {
+                                       val = 0;
+                               } else {
+                                       if ((ll = log + invlog[cc][i]) >= 255)
+                                               ll -= 255;
+                                       val = vdev_raidz_pow2[ll];
+                               }
+
+                               if (i == 0)
+                                       dst[cc][x] = val;
+                               else
+                                       dst[cc][x] ^= val;
+                       }
+               }
+       }
+
+       kmem_free(p, psize);
+}
+
+static int
+vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
+{
+       int n, i, c, t, tt;
+       int nmissing_rows;
+       int missing_rows[VDEV_RAIDZ_MAXPARITY];
+       int parity_map[VDEV_RAIDZ_MAXPARITY];
+
+       uint8_t *p, *pp;
+       size_t psize;
+
+       uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
+       uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
+       uint8_t *used;
+
+       int code = 0;
+
+
+       n = rm->rm_cols - rm->rm_firstdatacol;
+
+       /*
+        * Figure out which data columns are missing.
+        */
+       nmissing_rows = 0;
+       for (t = 0; t < ntgts; t++) {
+               if (tgts[t] >= rm->rm_firstdatacol) {
+                       missing_rows[nmissing_rows++] =
+                           tgts[t] - rm->rm_firstdatacol;
+               }
+       }
+
+       /*
+        * Figure out which parity columns to use to help generate the missing
+        * data columns.
+        */
+       for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
+               ASSERT(tt < ntgts);
+               ASSERT(c < rm->rm_firstdatacol);
+
+               /*
+                * Skip any targeted parity columns.
+                */
+               if (c == tgts[tt]) {
+                       tt++;
+                       continue;
+               }
+
+               code |= 1 << c;
+
+               parity_map[i] = c;
+               i++;
+       }
+
+       ASSERT(code != 0);
+       ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
+
+       psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
+           nmissing_rows * n + sizeof (used[0]) * n;
+       p = kmem_alloc(psize, KM_SLEEP);
+
+       for (pp = p, i = 0; i < nmissing_rows; i++) {
+               rows[i] = pp;
+               pp += n;
+               invrows[i] = pp;
+               pp += n;
+       }
+       used = pp;
+
+       for (i = 0; i < nmissing_rows; i++) {
+               used[i] = parity_map[i];
+       }
+
+       for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+               if (tt < nmissing_rows &&
+                   c == missing_rows[tt] + rm->rm_firstdatacol) {
+                       tt++;
+                       continue;
+               }
+
+               ASSERT3S(i, <, n);
+               used[i] = c;
+               i++;
+       }
+
+       /*
+        * Initialize the interesting rows of the matrix.
+        */
+       vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
+
+       /*
+        * Invert the matrix.
+        */
+       vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
+           invrows, used);
+
+       /*
+        * Reconstruct the missing data using the generated matrix.
+        */
+       vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
+           invrows, used);
+
+       kmem_free(p, psize);
+
+       return (code);
+}
+
+static int
+vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
+{
+       int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
+       int ntgts;
+       int i, c;
+       int code;
+       int nbadparity, nbaddata;
+       int parity_valid[VDEV_RAIDZ_MAXPARITY];
+
+       /*
+        * The tgts list must already be sorted.
+        */
+       for (i = 1; i < nt; i++) {
+               ASSERT(t[i] > t[i - 1]);
+       }
+
+       nbadparity = rm->rm_firstdatacol;
+       nbaddata = rm->rm_cols - nbadparity;
+       ntgts = 0;
+       for (i = 0, c = 0; c < rm->rm_cols; c++) {
+               if (c < rm->rm_firstdatacol)
+                       parity_valid[c] = B_FALSE;
+
+               if (i < nt && c == t[i]) {
+                       tgts[ntgts++] = c;
+                       i++;
+               } else if (rm->rm_col[c].rc_error != 0) {
+                       tgts[ntgts++] = c;
+               } else if (c >= rm->rm_firstdatacol) {
+                       nbaddata--;
+               } else {
+                       parity_valid[c] = B_TRUE;
+                       nbadparity--;
+               }
+       }
+
+       ASSERT(ntgts >= nt);
+       ASSERT(nbaddata >= 0);
+       ASSERT(nbaddata + nbadparity == ntgts);
+
+       dt = &tgts[nbadparity];
+
+       /*
+        * See if we can use any of our optimized reconstruction routines.
+        */
+       if (!vdev_raidz_default_to_general) {
+               switch (nbaddata) {
+               case 1:
+                       if (parity_valid[VDEV_RAIDZ_P])
+                               return (vdev_raidz_reconstruct_p(rm, dt, 1));
+
+                       ASSERT(rm->rm_firstdatacol > 1);
+
+                       if (parity_valid[VDEV_RAIDZ_Q])
+                               return (vdev_raidz_reconstruct_q(rm, dt, 1));
+
+                       ASSERT(rm->rm_firstdatacol > 2);
+                       break;
+
+               case 2:
+                       ASSERT(rm->rm_firstdatacol > 1);
+
+                       if (parity_valid[VDEV_RAIDZ_P] &&
+                           parity_valid[VDEV_RAIDZ_Q])
+                               return (vdev_raidz_reconstruct_pq(rm, dt, 2));
+
+                       ASSERT(rm->rm_firstdatacol > 2);
+
+                       break;
+               }
+       }
+
+       code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
+       ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
+       ASSERT(code > 0);
+       return (code);
+}
 
 static int
 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
 {
        vdev_t *cvd;
        uint64_t nparity = vd->vdev_nparity;
-       int c, error;
+       int c;
        int lasterror = 0;
        int numerrors = 0;
 
@@ -573,11 +1270,13 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
                return (EINVAL);
        }
 
+       vdev_open_children(vd);
+
        for (c = 0; c < vd->vdev_children; c++) {
                cvd = vd->vdev_child[c];
 
-               if ((error = vdev_open(cvd)) != 0) {
-                       lasterror = error;
+               if (cvd->vdev_open_error != 0) {
+                       lasterror = cvd->vdev_open_error;
                        numerrors++;
                        continue;
                }
@@ -639,7 +1338,7 @@ vdev_raidz_io_start(zio_t *zio)
        blkptr_t *bp = zio->io_bp;
        raidz_map_t *rm;
        raidz_col_t *rc;
-       int c;
+       int c, i;
 
        rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
            vd->vdev_nparity);
@@ -647,13 +1346,7 @@ vdev_raidz_io_start(zio_t *zio)
        ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
 
        if (zio->io_type == ZIO_TYPE_WRITE) {
-               /*
-                * Generate RAID parity in the first virtual columns.
-                */
-               if (rm->rm_firstdatacol == 1)
-                       vdev_raidz_generate_parity_p(rm);
-               else
-                       vdev_raidz_generate_parity_pq(rm);
+               vdev_raidz_generate_parity(rm);
 
                for (c = 0; c < rm->rm_cols; c++) {
                        rc = &rm->rm_col[c];
@@ -664,6 +1357,23 @@ vdev_raidz_io_start(zio_t *zio)
                            vdev_raidz_child_done, rc));
                }
 
+               /*
+                * Generate optional I/Os for any skipped sectors to improve
+                * aggregation contiguity.
+                */
+               for (c = rm->rm_bigcols, i = 0; i < rm->rm_skipped; c++, i++) {
+                       ASSERT(c <= rm->rm_scols);
+                       if (c == rm->rm_scols)
+                               c = 0;
+                       rc = &rm->rm_col[c];
+                       cvd = vd->vdev_child[rc->rc_devidx];
+                       zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+                           rc->rc_offset + rc->rc_size, NULL,
+                           1 << tvd->vdev_ashift,
+                           zio->io_type, zio->io_priority,
+                           ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
+               }
+
                return (ZIO_PIPELINE_CONTINUE);
        }
 
@@ -671,8 +1381,7 @@ vdev_raidz_io_start(zio_t *zio)
 
        /*
         * Iterate over the columns in reverse order so that we hit the parity
-        * last -- any errors along the way will force us to read the parity
-        * data.
+        * last -- any errors along the way will force us to read the parity.
         */
        for (c = rm->rm_cols - 1; c >= 0; c--) {
                rc = &rm->rm_col[c];
@@ -748,10 +1457,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
                bcopy(rc->rc_data, orig[c], rc->rc_size);
        }
 
-       if (rm->rm_firstdatacol == 1)
-               vdev_raidz_generate_parity_p(rm);
-       else
-               vdev_raidz_generate_parity_pq(rm);
+       vdev_raidz_generate_parity(rm);
 
        for (c = 0; c < rm->rm_firstdatacol; c++) {
                rc = &rm->rm_col[c];
@@ -768,9 +1474,10 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
        return (ret);
 }
 
-static uint64_t raidz_corrected_p;
-static uint64_t raidz_corrected_q;
-static uint64_t raidz_corrected_pq;
+/*
+ * Keep statistics on all the ways that we used parity to correct data.
+ * Each counter is indexed by the code returned from
+ * vdev_raidz_reconstruct() for the reconstruction that produced a block
+ * which passed its checksum.
+ */
+static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
 
 static int
 vdev_raidz_worst_error(raidz_map_t *rm)
@@ -783,19 +1490,176 @@ vdev_raidz_worst_error(raidz_map_t *rm)
        return (error);
 }
 
+/*
+ * Iterate over all combinations of bad data and attempt a reconstruction.
+ * Note that the algorithm below is non-optimal because it doesn't take into
+ * account how reconstruction is actually performed. For example, with
+ * triple-parity RAID-Z the reconstruction procedure is the same if column 4
+ * is targeted as invalid as if columns 1 and 4 are targeted since in both
+ * cases we'd only use parity information in column 0.
+ *
+ * For each n from 1 up to (rm_firstdatacol - total_errors), combinations of
+ * n columns that reported no error (rc_error == 0) are treated as silently
+ * bad: their data is saved, vdev_raidz_reconstruct() is run with those
+ * columns as targets, and the block is checksummed again. On the first
+ * combination that checksums, the targeted columns are marked ECKSUM and
+ * the code from vdev_raidz_reconstruct() is returned. After every failed
+ * attempt the saved data is copied back; if no combination checksums, 0 is
+ * returned.
+ *
+ * total_errors must be less than rm_firstdatacol. data_errors is only
+ * compared against zero: when no data column reported an error, the
+ * highest target of each initial combination is forced into the data
+ * columns.
+ */
+static int
+vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
+{
+       raidz_map_t *rm = zio->io_vsd;
+       raidz_col_t *rc;
+       void *orig[VDEV_RAIDZ_MAXPARITY];
+       int tstore[VDEV_RAIDZ_MAXPARITY + 2];
+       int *tgts = &tstore[1];
+       int current, next, i, c, n;
+       int code, ret = 0;
+
+       ASSERT(total_errors < rm->rm_firstdatacol);
+
+       /*
+        * This simplifies one edge condition.
+        * tgts points at tstore[1], so tgts[-1] is tstore[0]; it is read
+        * below as tgts[current - 1] when current is 0.
+        */
+       tgts[-1] = -1;
+
+       for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
+               /*
+                * Initialize the targets array by finding the first n columns
+                * that contain no error.
+                *
+                * If there were no data errors, we need to ensure that we're
+                * always explicitly attempting to reconstruct at least one
+                * data column. To do this, we simply push the highest target
+                * up into the data columns.
+                */
+               for (c = 0, i = 0; i < n; i++) {
+                       if (i == n - 1 && data_errors == 0 &&
+                           c < rm->rm_firstdatacol) {
+                               c = rm->rm_firstdatacol;
+                       }
+
+                       while (rm->rm_col[c].rc_error != 0) {
+                               c++;
+                               ASSERT3S(c, <, rm->rm_cols);
+                       }
+
+                       tgts[i] = c++;
+               }
+
+               /*
+                * Setting tgts[n] simplifies the other edge condition.
+                * It is read below as tgts[current + 1] when current is
+                * n - 1. tstore's two extra slots presumably exist for this
+                * sentinel and for tgts[-1].
+                */
+               tgts[n] = rm->rm_cols;
+
+               /*
+                * These buffers were allocated in previous iterations.
+                * Each pass of the outer loop allocates one more, so
+                * orig[0] through orig[n - 1] are live during pass n.
+                * NOTE(review): every buffer is sized by column 0's rc_size;
+                * presumably no column is larger than column 0 -- confirm.
+                */
+               for (i = 0; i < n - 1; i++) {
+                       ASSERT(orig[i] != NULL);
+               }
+
+               orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
+
+               current = 0;
+               next = tgts[current];
+
+               while (current != n) {
+                       tgts[current] = next;
+                       current = 0;
+
+                       /*
+                        * Save off the original data that we're going to
+                        * attempt to reconstruct.
+                        */
+                       for (i = 0; i < n; i++) {
+                               ASSERT(orig[i] != NULL);
+                               c = tgts[i];
+                               ASSERT3S(c, >=, 0);
+                               ASSERT3S(c, <, rm->rm_cols);
+                               rc = &rm->rm_col[c];
+                               bcopy(rc->rc_data, orig[i], rc->rc_size);
+                       }
+
+                       /*
+                        * Attempt a reconstruction and exit the outer loop on
+                        * success.
+                        * Success means the block checksums afterwards. The
+                        * counter for the returned code is bumped, and each
+                        * target column is flagged ECKSUM; a checksum error
+                        * is reported only for targets with rc_tried set.
+                        */
+                       code = vdev_raidz_reconstruct(rm, tgts, n);
+                       if (zio_checksum_error(zio) == 0) {
+                               atomic_inc_64(&raidz_corrected[code]);
+
+                               for (i = 0; i < n; i++) {
+                                       c = tgts[i];
+                                       rc = &rm->rm_col[c];
+                                       ASSERT(rc->rc_error == 0);
+                                       if (rc->rc_tried)
+                                               raidz_checksum_error(zio, rc);
+                                       rc->rc_error = ECKSUM;
+                               }
+
+                               ret = code;
+                               goto done;
+                       }
+
+                       /*
+                        * Restore the original data.
+                        */
+                       for (i = 0; i < n; i++) {
+                               c = tgts[i];
+                               rc = &rm->rm_col[c];
+                               bcopy(orig[i], rc->rc_data, rc->rc_size);
+                       }
+
+                       do {
+                               /*
+                                * Find the next valid column after the current
+                                * position.
+                                */
+                               for (next = tgts[current] + 1;
+                                   next < rm->rm_cols &&
+                                   rm->rm_col[next].rc_error != 0; next++)
+                                       continue;
+
+                               ASSERT(next <= tgts[current + 1]);
+
+                               /*
+                                * If that spot is available, we're done here.
+                                */
+                               if (next != tgts[current + 1])
+                                       break;
+
+                               /*
+                                * Otherwise, find the next valid column after
+                                * the previous position.
+                                */
+                               for (c = tgts[current - 1] + 1;
+                                   rm->rm_col[c].rc_error != 0; c++)
+                                       continue;
+
+                               tgts[current] = c;
+                               current++;
+
+                       } while (current != n);
+               }
+       }
+       n--;    /* the loop exits with n one past the last pass */
+done:
+       for (i = 0; i < n; i++) {
+               zio_buf_free(orig[i], rm->rm_col[0].rc_size);
+       }
+
+       return (ret);
+}
+
 static void
 vdev_raidz_io_done(zio_t *zio)
 {
        vdev_t *vd = zio->io_vd;
        vdev_t *cvd;
        raidz_map_t *rm = zio->io_vsd;
-       raidz_col_t *rc, *rc1;
+       raidz_col_t *rc;
        int unexpected_errors = 0;
        int parity_errors = 0;
        int parity_untried = 0;
        int data_errors = 0;
        int total_errors = 0;
-       int n, c, c1;
+       int n, c;
+       int tgts[VDEV_RAIDZ_MAXPARITY];
+       int code;
 
        ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
 
@@ -859,8 +1723,7 @@ vdev_raidz_io_done(zio_t *zio)
         * any errors.
         */
        if (total_errors <= rm->rm_firstdatacol - parity_untried) {
-               switch (data_errors) {
-               case 0:
+               if (data_errors == 0) {
                        if (zio_checksum_error(zio) == 0) {
                                /*
                                 * If we read parity information (unnecessarily
@@ -880,9 +1743,7 @@ vdev_raidz_io_done(zio_t *zio)
                                }
                                goto done;
                        }
-                       break;
-
-               case 1:
+               } else {
                        /*
                         * We either attempt to read all the parity columns or
                         * none of them. If we didn't try to read parity, we
@@ -894,45 +1755,38 @@ vdev_raidz_io_done(zio_t *zio)
                        ASSERT(parity_errors < rm->rm_firstdatacol);
 
                        /*
-                        * Find the column that reported the error.
+                        * Identify the data columns that reported an error.
                         */
+                       n = 0;
                        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                                rc = &rm->rm_col[c];
-                               if (rc->rc_error != 0)
-                                       break;
+                               if (rc->rc_error != 0) {
+                                       ASSERT(n < VDEV_RAIDZ_MAXPARITY);
+                                       tgts[n++] = c;
+                               }
                        }
-                       ASSERT(c != rm->rm_cols);
-                       ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
-                           rc->rc_error == ESTALE);
 
-                       if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
-                               vdev_raidz_reconstruct_p(rm, c);
-                       } else {
-                               ASSERT(rm->rm_firstdatacol > 1);
-                               vdev_raidz_reconstruct_q(rm, c);
-                       }
+                       ASSERT(rm->rm_firstdatacol >= n);
+
+                       code = vdev_raidz_reconstruct(rm, tgts, n);
 
                        if (zio_checksum_error(zio) == 0) {
-                               if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
-                                       atomic_inc_64(&raidz_corrected_p);
-                               else
-                                       atomic_inc_64(&raidz_corrected_q);
+                               atomic_inc_64(&raidz_corrected[code]);
 
                                /*
-                                * If there's more than one parity disk that
-                                * was successfully read, confirm that the
-                                * other parity disk produced the correct data.
-                                * This routine is suboptimal in that it
-                                * regenerates both the parity we wish to test
-                                * as well as the parity we just used to
-                                * perform the reconstruction, but this should
-                                * be a relatively uncommon case, and can be
-                                * optimized if it becomes a problem.
-                                * We also regenerate parity when resilvering
-                                * so we can write it out to the failed device
-                                * later.
+                                * If we read more parity disks than were used
+                                * for reconstruction, confirm that the other
+                                * parity disks produced correct data. This
+                                * routine is suboptimal in that it regenerates
+                                * the parity that we already used in addition
+                                * to the parity that we're attempting to
+                                * verify, but this should be a relatively
+                                * uncommon case, and can be optimized if it
+                                * becomes a problem. Note that we regenerate
+                                * parity when resilvering so we can write it
+                                * out to failed devices later.
                                 */
-                               if (parity_errors < rm->rm_firstdatacol - 1 ||
+                               if (parity_errors < rm->rm_firstdatacol - n ||
                                    (zio->io_flags & ZIO_FLAG_RESILVER)) {
                                        n = raidz_parity_verify(zio, rm);
                                        unexpected_errors += n;
@@ -942,46 +1796,6 @@ vdev_raidz_io_done(zio_t *zio)
 
                                goto done;
                        }
-                       break;
-
-               case 2:
-                       /*
-                        * Two data column errors require double parity.
-                        */
-                       ASSERT(rm->rm_firstdatacol == 2);
-
-                       /*
-                        * Find the two columns that reported errors.
-                        */
-                       for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
-                               rc = &rm->rm_col[c];
-                               if (rc->rc_error != 0)
-                                       break;
-                       }
-                       ASSERT(c != rm->rm_cols);
-                       ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
-                           rc->rc_error == ESTALE);
-
-                       for (c1 = c++; c < rm->rm_cols; c++) {
-                               rc = &rm->rm_col[c];
-                               if (rc->rc_error != 0)
-                                       break;
-                       }
-                       ASSERT(c != rm->rm_cols);
-                       ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
-                           rc->rc_error == ESTALE);
-
-                       vdev_raidz_reconstruct_pq(rm, c1, c);
-
-                       if (zio_checksum_error(zio) == 0) {
-                               atomic_inc_64(&raidz_corrected_pq);
-                               goto done;
-                       }
-                       break;
-
-               default:
-                       ASSERT(rm->rm_firstdatacol <= 2);
-                       ASSERT(0);
                }
        }
 
@@ -1020,8 +1834,10 @@ vdev_raidz_io_done(zio_t *zio)
         * errors we detected, and we've attempted to read all columns. There
         * must, therefore, be one or more additional problems -- silent errors
         * resulting in invalid data rather than explicit I/O errors resulting
-        * in absent data. Before we attempt combinatorial reconstruction make
-        * sure we have a chance of coming up with the right answer.
+        * in absent data. We check if there is enough additional data to
+        * possibly reconstruct the data and then perform combinatorial
+        * reconstruction over all possible combinations. If that fails,
+        * we're cooked.
         */
        if (total_errors >= rm->rm_firstdatacol) {
                zio->io_error = vdev_raidz_worst_error(rm);
@@ -1032,133 +1848,30 @@ vdev_raidz_io_done(zio_t *zio)
                 */
                if (total_errors == rm->rm_firstdatacol)
                        zio->io_error = zio_worst_error(zio->io_error, ECKSUM);
-               goto done;
-       }
 
-       if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
+       } else if ((code = vdev_raidz_combrec(zio, total_errors,
+           data_errors)) != 0) {
                /*
-                * Attempt to reconstruct the data from parity P.
+                * If we didn't use all the available parity for the
+                * combinatorial reconstruction, verify that the remaining
+                * parity is correct.
                 */
-               for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
-                       void *orig;
-                       rc = &rm->rm_col[c];
-
-                       orig = zio_buf_alloc(rc->rc_size);
-                       bcopy(rc->rc_data, orig, rc->rc_size);
-                       vdev_raidz_reconstruct_p(rm, c);
-
-                       if (zio_checksum_error(zio) == 0) {
-                               zio_buf_free(orig, rc->rc_size);
-                               atomic_inc_64(&raidz_corrected_p);
-
-                               /*
-                                * If this child didn't know that it returned
-                                * bad data, inform it.
-                                */
-                               if (rc->rc_tried && rc->rc_error == 0)
-                                       raidz_checksum_error(zio, rc);
-                               rc->rc_error = ECKSUM;
-                               goto done;
-                       }
-
-                       bcopy(orig, rc->rc_data, rc->rc_size);
-                       zio_buf_free(orig, rc->rc_size);
-               }
-       }
-
-       if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
+               if (code != (1 << rm->rm_firstdatacol) - 1)
+                       (void) raidz_parity_verify(zio, rm);
+       } else {
                /*
-                * Attempt to reconstruct the data from parity Q.
+                * All combinations failed to checksum. Generate checksum
+                * ereports for all children.
                 */
-               for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
-                       void *orig;
-                       rc = &rm->rm_col[c];
-
-                       orig = zio_buf_alloc(rc->rc_size);
-                       bcopy(rc->rc_data, orig, rc->rc_size);
-                       vdev_raidz_reconstruct_q(rm, c);
-
-                       if (zio_checksum_error(zio) == 0) {
-                               zio_buf_free(orig, rc->rc_size);
-                               atomic_inc_64(&raidz_corrected_q);
-
-                               /*
-                                * If this child didn't know that it returned
-                                * bad data, inform it.
-                                */
-                               if (rc->rc_tried && rc->rc_error == 0)
-                                       raidz_checksum_error(zio, rc);
-                               rc->rc_error = ECKSUM;
-                               goto done;
-                       }
-
-                       bcopy(orig, rc->rc_data, rc->rc_size);
-                       zio_buf_free(orig, rc->rc_size);
-               }
-       }
+               zio->io_error = ECKSUM;
 
-       if (rm->rm_firstdatacol > 1 &&
-           rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
-           rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
-               /*
-                * Attempt to reconstruct the data from both P and Q.
-                */
-               for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
-                       void *orig, *orig1;
-                       rc = &rm->rm_col[c];
-
-                       orig = zio_buf_alloc(rc->rc_size);
-                       bcopy(rc->rc_data, orig, rc->rc_size);
-
-                       for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
-                               rc1 = &rm->rm_col[c1];
-
-                               orig1 = zio_buf_alloc(rc1->rc_size);
-                               bcopy(rc1->rc_data, orig1, rc1->rc_size);
-
-                               vdev_raidz_reconstruct_pq(rm, c, c1);
-
-                               if (zio_checksum_error(zio) == 0) {
-                                       zio_buf_free(orig, rc->rc_size);
-                                       zio_buf_free(orig1, rc1->rc_size);
-                                       atomic_inc_64(&raidz_corrected_pq);
-
-                                       /*
-                                        * If these children didn't know they
-                                        * returned bad data, inform them.
-                                        */
-                                       if (rc->rc_tried && rc->rc_error == 0)
-                                               raidz_checksum_error(zio, rc);
-                                       if (rc1->rc_tried && rc1->rc_error == 0)
-                                               raidz_checksum_error(zio, rc1);
-
-                                       rc->rc_error = ECKSUM;
-                                       rc1->rc_error = ECKSUM;
-
-                                       goto done;
-                               }
-
-                               bcopy(orig1, rc1->rc_data, rc1->rc_size);
-                               zio_buf_free(orig1, rc1->rc_size);
+               if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+                       for (c = 0; c < rm->rm_cols; c++) {
+                               rc = &rm->rm_col[c];
+                               zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
+                                   zio->io_spa, vd->vdev_child[rc->rc_devidx],
+                                   zio, rc->rc_offset, rc->rc_size);
                        }
-
-                       bcopy(orig, rc->rc_data, rc->rc_size);
-                       zio_buf_free(orig, rc->rc_size);
-               }
-       }
-
-       /*
-        * All combinations failed to checksum. Generate checksum ereports for
-        * all children.
-        */
-       zio->io_error = ECKSUM;
-
-       if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
-               for (c = 0; c < rm->rm_cols; c++) {
-                       rc = &rm->rm_col[c];
-                       zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
-                           zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
-                           rc->rc_offset, rc->rc_size);
                }
        }
 
index 88383f0..524c8e6 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -52,7 +52,6 @@ too_many_errors(vdev_t *vd, int numerrors)
 static int
 vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
 {
-       int c;
        int lasterror = 0;
        int numerrors = 0;
 
@@ -61,15 +60,14 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
                return (EINVAL);
        }
 
-       for (c = 0; c < vd->vdev_children; c++) {
+       vdev_open_children(vd);
+
+       for (int c = 0; c < vd->vdev_children; c++) {
                vdev_t *cvd = vd->vdev_child[c];
-               int error;
 
-               if ((error = vdev_open(cvd)) != 0 &&
-                   !cvd->vdev_islog) {
-                       lasterror = error;
+               if (cvd->vdev_open_error && !cvd->vdev_islog) {
+                       lasterror = cvd->vdev_open_error;
                        numerrors++;
-                       continue;
                }
        }
 
@@ -87,9 +85,7 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
 static void
 vdev_root_close(vdev_t *vd)
 {
-       int c;
-
-       for (c = 0; c < vd->vdev_children; c++)
+       for (int c = 0; c < vd->vdev_children; c++)
                vdev_close(vd->vdev_child[c]);
 }
 
index fbc93b4..528d31d 100644 (file)
@@ -1068,7 +1068,7 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
 
 int
 zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
-    uint64_t *towrite, uint64_t *tooverwrite, uint64_t dn_datablkshift)
+    uint64_t *towrite, uint64_t *tooverwrite)
 {
        zap_t *zap;
        int err = 0;
@@ -1113,28 +1113,28 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
                        *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
                }
        } else {
-               if (!add) {
-                       if (dmu_buf_freeable(zap->zap_dbuf))
-                               *tooverwrite += SPA_MAXBLOCKSIZE;
-                       else
-                               *towrite += SPA_MAXBLOCKSIZE;
-               } else {
-                       /*
-                        * We are here if we are adding and (name != NULL).
-                        * It is hard to find out if this add will promote this
-                        * microzap to fatzap. Hence, we assume the worst case
-                        * and account for the blocks assuming this microzap
-                        * would be promoted to a fatzap.
-                        *
-                        * 1 block overwritten  : header block
-                        * 4 new blocks written : 2 new split leaf, 2 grown
-                        *                      ptrtbl blocks
-                        */
-                       if (dmu_buf_freeable(zap->zap_dbuf))
-                               *tooverwrite += 1 << dn_datablkshift;
-                       else
-                               *towrite += 1 << dn_datablkshift;
-                       *towrite += 4 << dn_datablkshift;
+               /*
+                * We are here if (name != NULL) and this is a micro-zap.
+                * We account for the header block depending on whether it
+                * is freeable.
+                *
+                * In case of an add operation, it is hard to find out
+                * if this add will promote this microzap to fatzap.
+                * Hence, we consider the worst case and account for the
+                * blocks assuming this microzap would be promoted to a
+                * fatzap.
+                *
+                * 1 block overwritten  : header block
+                * 4 new blocks written : 2 new split leaf, 2 grown
+                *                      ptrtbl blocks
+                */
+               if (dmu_buf_freeable(zap->zap_dbuf))
+                       *tooverwrite += SPA_MAXBLOCKSIZE;
+               else
+                       *towrite += SPA_MAXBLOCKSIZE;
+
+               if (add) {
+                       *towrite += 4 * SPA_MAXBLOCKSIZE;
                }
        }
 
index 734bd83..12ffe9f 100644 (file)
@@ -93,6 +93,8 @@
 #define        ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\
     ZFS_ACL_OBJ_ACE)
 
+#define        ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH)
+
 static uint16_t
 zfs_ace_v0_get_type(void *acep)
 {
@@ -781,6 +783,7 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
        uint64_t        who;
        uint16_t        iflags, type;
        uint32_t        access_mask;
+       boolean_t       an_exec_denied = B_FALSE;
 
        mode = (zp->z_phys->zp_mode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
 
@@ -905,8 +908,32 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
                                        }
                                }
                        }
+               } else {
+                       /*
+                        * We only care whether this IDENTIFIER_GROUP or
+                        * USER ACE denies execute access to someone;
+                        * the mode is not affected.
+                        */
+                       if ((access_mask & ACE_EXECUTE) && type == DENY)
+                               an_exec_denied = B_TRUE;
                }
        }
+
+       /*
+        * Failure to allow is effectively a deny, so execute permission
+        * is denied if it was never mentioned or if we explicitly
+        * weren't allowed it.
+        */
+       if (!an_exec_denied &&
+           ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS ||
+           (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS))
+               an_exec_denied = B_TRUE;
+
+       if (an_exec_denied)
+               zp->z_phys->zp_flags &= ~ZFS_NO_EXECS_DENIED;
+       else
+               zp->z_phys->zp_flags |= ZFS_NO_EXECS_DENIED;
+
        return (mode);
 }
 
@@ -946,7 +973,8 @@ zfs_acl_node_read_internal(znode_t *zp, boolean_t will_modify)
 }
 
 /*
- * Read an external acl object.
+ * Read an external acl object.  If the intent is to modify, always
+ * create a new acl and leave any cached acl in place.
  */
 static int
 zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
@@ -960,8 +988,15 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
 
        ASSERT(MUTEX_HELD(&zp->z_acl_lock));
 
+       if (zp->z_acl_cached && !will_modify) {
+               *aclpp = zp->z_acl_cached;
+               return (0);
+       }
+
        if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) {
                *aclpp = zfs_acl_node_read_internal(zp, will_modify);
+               if (!will_modify)
+                       zp->z_acl_cached = *aclpp;
                return (0);
        }
 
@@ -995,6 +1030,8 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
        }
 
        *aclpp = aclp;
+       if (!will_modify)
+               zp->z_acl_cached = aclp;
        return (0);
 }
 
@@ -1019,11 +1056,16 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
 
        dmu_buf_will_dirty(zp->z_dbuf, tx);
 
+       if (zp->z_acl_cached) {
+               zfs_acl_free(zp->z_acl_cached);
+               zp->z_acl_cached = NULL;
+       }
+
        zphys->zp_mode = zfs_mode_compute(zp, aclp);
 
        /*
-        * Decide which opbject type to use.  If we are forced to
-        * use old ACL format than transform ACL into zfs_oldace_t
+        * Decide which object type to use.  If we are forced to
+        * use old ACL format then transform ACL into zfs_oldace_t
         * layout.
         */
        if (!zfsvfs->z_use_fuids) {
@@ -1869,7 +1911,6 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
                        mutex_exit(&dzp->z_acl_lock);
                        acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
                            vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
-                       zfs_acl_free(paclp);
                } else {
                        acl_ids->z_aclp =
                            zfs_acl_alloc(zfs_acl_version_zp(dzp));
@@ -1998,8 +2039,6 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
 
        mutex_exit(&zp->z_acl_lock);
 
-       zfs_acl_free(aclp);
-
        return (0);
 }
 
@@ -2095,11 +2134,6 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
                aclp->z_hints |= (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS);
        }
 top:
-       if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) {
-               zfs_acl_free(aclp);
-               return (error);
-       }
-
        mutex_enter(&zp->z_lock);
        mutex_enter(&zp->z_acl_lock);
 
@@ -2145,6 +2179,7 @@ top:
 
        error = zfs_aclset_common(zp, aclp, cr, tx);
        ASSERT(error == 0);
+       zp->z_acl_cached = aclp;
 
        if (fuid_dirtied)
                zfs_fuid_sync(zfsvfs, tx);
@@ -2154,7 +2189,6 @@ top:
 
        if (fuidp)
                zfs_fuid_info_free(fuidp);
-       zfs_acl_free(aclp);
        dmu_tx_commit(tx);
 done:
        mutex_exit(&zp->z_acl_lock);
@@ -2301,7 +2335,6 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
                                        checkit = B_TRUE;
                                break;
                        } else {
-                               zfs_acl_free(aclp);
                                mutex_exit(&zp->z_acl_lock);
                                return (EIO);
                        }
@@ -2321,7 +2354,6 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
                                    uint32_t, mask_matched);
                                if (anyaccess) {
                                        mutex_exit(&zp->z_acl_lock);
-                                       zfs_acl_free(aclp);
                                        return (0);
                                }
                        }
@@ -2334,7 +2366,6 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
        }
 
        mutex_exit(&zp->z_acl_lock);
-       zfs_acl_free(aclp);
 
        /* Put the found 'denies' back on the working mode */
        if (deny_mask) {
@@ -2366,8 +2397,7 @@ zfs_has_access(znode_t *zp, cred_t *cr)
                    secpolicy_vnode_access(cr, ZTOV(zp), owner, VREAD) == 0 ||
                    secpolicy_vnode_access(cr, ZTOV(zp), owner, VWRITE) == 0 ||
                    secpolicy_vnode_access(cr, ZTOV(zp), owner, VEXEC) == 0 ||
-                   secpolicy_vnode_chown(cr, B_TRUE) == 0 ||
-                   secpolicy_vnode_chown(cr, B_FALSE) == 0 ||
+                   secpolicy_vnode_chown(cr, owner) == 0 ||
                    secpolicy_vnode_setdac(cr, owner) == 0 ||
                    secpolicy_vnode_remove(cr) == 0);
        }
@@ -2421,6 +2451,78 @@ zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs,
            check_privs, B_FALSE, cr));
 }
 
+int  /* 0 when execute access is granted, otherwise an errno value */
+zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)  /* fast-path ACE_EXECUTE check on zdp for cr */
+{
+       boolean_t owner = B_FALSE;  /* NOTE(review): still B_FALSE whenever it is tested below */
+       boolean_t groupmbr = B_FALSE;  /* NOTE(review): likewise; every branch that sets it leaves */
+       boolean_t is_attr;
+       uid_t fowner;
+       uid_t gowner;  /* NOTE(review): receives zp_gid yet is declared uid_t */
+       uid_t uid = crgetuid(cr);
+       int error;
+
+       if (zdp->z_phys->zp_flags & ZFS_AV_QUARANTINED)
+               return (EACCES);  /* quarantined znodes are refused outright */
+
+       is_attr = ((zdp->z_phys->zp_flags & ZFS_XATTR) &&
+           (ZTOV(zdp)->v_type == VDIR));
+       if (is_attr)
+               goto slow;  /* xattr directories always take the full check */
+
+       mutex_enter(&zdp->z_acl_lock);  /* held while zp_flags, ids and zp_mode are read */
+
+       if (zdp->z_phys->zp_flags & ZFS_NO_EXECS_DENIED) {  /* flag alone is enough to grant */
+               mutex_exit(&zdp->z_acl_lock);
+               return (0);
+       }
+
+       if (FUID_INDEX(zdp->z_phys->zp_uid) != 0 ||
+           FUID_INDEX(zdp->z_phys->zp_gid) != 0) {  /* owner or group id carries a FUID index */
+               mutex_exit(&zdp->z_acl_lock);
+               goto slow;
+       }
+
+       fowner = (uid_t)zdp->z_phys->zp_uid;
+       gowner = (uid_t)zdp->z_phys->zp_gid;
+
+       if (uid == fowner) {  /* caller owns the file: S_IXUSR decides the fast path */
+               owner = B_TRUE;
+               if (zdp->z_phys->zp_mode & S_IXUSR) {
+                       mutex_exit(&zdp->z_acl_lock);
+                       return (0);
+               } else {
+                       mutex_exit(&zdp->z_acl_lock);
+                       goto slow;
+               }
+       }
+       if (groupmember(gowner, cr)) {  /* group member: S_IXGRP decides the fast path */
+               groupmbr = B_TRUE;
+               if (zdp->z_phys->zp_mode & S_IXGRP) {
+                       mutex_exit(&zdp->z_acl_lock);
+                       return (0);
+               } else {
+                       mutex_exit(&zdp->z_acl_lock);
+                       goto slow;
+               }
+       }
+       if (!owner && !groupmbr) {  /* neither owner nor group member: try S_IXOTH */
+               if (zdp->z_phys->zp_mode & S_IXOTH) {
+                       mutex_exit(&zdp->z_acl_lock);
+                       return (0);
+               }
+       }
+
+       mutex_exit(&zdp->z_acl_lock);  /* S_IXOTH clear: fall through to the slow path */
+
+slow:  /* full access check through zfs_zaccess() */
+       DTRACE_PROBE(zfs__fastpath__execute__access__miss);  /* records a fast-path miss */
+       ZFS_ENTER(zdp->z_zfsvfs);
+       error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr);
+       ZFS_EXIT(zdp->z_zfsvfs);
+       return (error);
+}
+
 /*
  * Determine whether Access should be granted/denied, invoking least
  * priv subsytem when a deny is determined.
@@ -2515,7 +2617,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
                            owner, checkmode);
 
                if (error == 0 && (working_mode & ACE_WRITE_OWNER))
-                       error = secpolicy_vnode_chown(cr, B_TRUE);
+                       error = secpolicy_vnode_chown(cr, owner);
                if (error == 0 && (working_mode & ACE_WRITE_ACL))
                        error = secpolicy_vnode_setdac(cr, owner);
 
@@ -2524,7 +2626,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
                        error = secpolicy_vnode_remove(cr);
 
                if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) {
-                       error = secpolicy_vnode_chown(cr, B_FALSE);
+                       error = secpolicy_vnode_chown(cr, owner);
                }
                if (error == 0) {
                        /*
index 27c2c51..c6c7198 100644 (file)
@@ -700,7 +700,7 @@ zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
                if (err)
                        avl_add(&sdp->sd_snaps, sep);
                else
-                       err = dmu_objset_destroy(snapname);
+                       err = dmu_objset_destroy(snapname, B_FALSE);
        } else {
                err = ENOENT;
        }
index 8e481df..e704b1c 100644 (file)
@@ -353,6 +353,7 @@ retry:
                rw_exit(&zfsvfs->z_fuid_lock);
                return (retidx);
        } else {
+               rw_exit(&zfsvfs->z_fuid_lock);
                return (-1);
        }
 }
index 07dd03c..9cb4081 100644 (file)
@@ -761,6 +761,20 @@ zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, cred_t *cr)
        return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, cr));
 }
 
+static int  /* result of zfs_secpolicy_write_perms() is returned unchanged */
+zfs_secpolicy_hold(zfs_cmd_t *zc, cred_t *cr)  /* policy check paired with zfs_ioc_hold in zfs_ioc_vec[] */
+{
+       return (zfs_secpolicy_write_perms(zc->zc_name,
+           ZFS_DELEG_PERM_HOLD, cr));  /* checks the hold permission on zc_name for cr */
+}
+
+static int  /* result of zfs_secpolicy_write_perms() is returned unchanged */
+zfs_secpolicy_release(zfs_cmd_t *zc, cred_t *cr)  /* policy check paired with zfs_ioc_release in zfs_ioc_vec[] */
+{
+       return (zfs_secpolicy_write_perms(zc->zc_name,
+           ZFS_DELEG_PERM_RELEASE, cr));  /* checks the release permission on zc_name for cr */
+}
+
 /*
  * Returns the nvlist as specified by the user in the zfs_cmd_t.
  */
@@ -2466,7 +2480,7 @@ zfs_ioc_create(zfs_cmd_t *zc)
         */
        if (error == 0) {
                if ((error = zfs_set_prop_nvlist(zc->zc_name, nvprops)) != 0)
-                       (void) dmu_objset_destroy(zc->zc_name);
+                       (void) dmu_objset_destroy(zc->zc_name, B_FALSE);
        }
        nvlist_free(nvprops);
        return (error);
@@ -2553,8 +2567,9 @@ zfs_unmount_snap(char *name, void *arg)
 
 /*
  * inputs:
- * zc_name     name of filesystem
- * zc_value    short name of snapshot
+ * zc_name             name of filesystem
+ * zc_value            short name of snapshot
+ * zc_defer_destroy    mark for deferred destroy
  *
  * outputs:    none
  */
@@ -2569,13 +2584,15 @@ zfs_ioc_destroy_snaps(zfs_cmd_t *zc)
            zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN);
        if (err)
                return (err);
-       return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value));
+       return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value,
+           zc->zc_defer_destroy));
 }
 
 /*
  * inputs:
  * zc_name             name of dataset to destroy
  * zc_objset_type      type of objset
+ * zc_defer_destroy    mark for deferred destroy
  *
  * outputs:            none
  */
@@ -2588,7 +2605,7 @@ zfs_ioc_destroy(zfs_cmd_t *zc)
                        return (err);
        }
 
-       return (dmu_objset_destroy(zc->zc_name));
+       return (dmu_objset_destroy(zc->zc_name, zc->zc_defer_destroy));
 }
 
 /*
@@ -2708,7 +2725,6 @@ zfs_ioc_recv(zfs_cmd_t *zc)
        file_t *fp;
        objset_t *os;
        dmu_recv_cookie_t drc;
-       zfsvfs_t *zfsvfs = NULL;
        boolean_t force = (boolean_t)zc->zc_guid;
        int error, fd;
        offset_t off;
@@ -2740,25 +2756,12 @@ zfs_ioc_recv(zfs_cmd_t *zc)
                return (EBADF);
        }
 
-       if (getzfsvfs(tofs, &zfsvfs) == 0) {
-               if (!mutex_tryenter(&zfsvfs->z_online_recv_lock)) {
-                       VFS_RELE(zfsvfs->z_vfs);
-                       zfsvfs = NULL;
-                       error = EBUSY;
-                       goto out;
-               }
+       if (props && dmu_objset_open(tofs, DMU_OST_ANY,
+           DS_MODE_USER | DS_MODE_READONLY, &os) == 0) {
                /*
                 * If new properties are supplied, they are to completely
                 * replace the existing ones, so stash away the existing ones.
                 */
-               if (props)
-                       (void) dsl_prop_get_all(zfsvfs->z_os, &origprops, TRUE);
-       } else if (props && dmu_objset_open(tofs, DMU_OST_ANY,
-           DS_MODE_USER | DS_MODE_READONLY, &os) == 0) {
-               /*
-                * Get the props even if there was no zfsvfs (zvol or
-                * unmounted zpl).
-                */
                (void) dsl_prop_get_all(os, &origprops, TRUE);
 
                dmu_objset_close(os);
@@ -2772,7 +2775,7 @@ zfs_ioc_recv(zfs_cmd_t *zc)
        }
 
        error = dmu_recv_begin(tofs, tosnap, &zc->zc_begin_record,
-           force, origin, zfsvfs != NULL, &drc);
+           force, origin, &drc);
        if (origin)
                dmu_objset_close(origin);
        if (error)
@@ -2793,25 +2796,33 @@ zfs_ioc_recv(zfs_cmd_t *zc)
        off = fp->f_offset;
        error = dmu_recv_stream(&drc, fp->f_vnode, &off);
 
-       if (error == 0 && zfsvfs) {
-               char *osname;
-               int mode;
+       if (error == 0) {
+               zfsvfs_t *zfsvfs = NULL;
 
-               /* online recv */
-               osname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
-               error = zfs_suspend_fs(zfsvfs, osname, &mode);
-               if (error == 0) {
-                       int resume_err;
+               if (getzfsvfs(tofs, &zfsvfs) == 0) {
+                       /* online recv */
+                       int end_err;
+                       char *osname;
+                       int mode;
 
-                       error = dmu_recv_end(&drc);
-                       resume_err = zfs_resume_fs(zfsvfs, osname, mode);
-                       error = error ? error : resume_err;
+                       osname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+                       error = zfs_suspend_fs(zfsvfs, osname, &mode);
+                       /*
+                        * If the suspend fails, then the recv_end will
+                        * likely also fail, and clean up after itself.
+                        */
+                       end_err = dmu_recv_end(&drc);
+                       if (error == 0) {
+                               int resume_err =
+                                   zfs_resume_fs(zfsvfs, osname, mode);
+                               error = error ? error : resume_err;
+                       }
+                       error = error ? error : end_err;
+                       VFS_RELE(zfsvfs->z_vfs);
+                       kmem_free(osname, MAXNAMELEN);
                } else {
-                       dmu_recv_abort_cleanup(&drc);
+                       error = dmu_recv_end(&drc);
                }
-               kmem_free(osname, MAXNAMELEN);
-       } else if (error == 0) {
-               error = dmu_recv_end(&drc);
        }
 
        zc->zc_cookie = off - fp->f_offset;
@@ -2826,10 +2837,6 @@ zfs_ioc_recv(zfs_cmd_t *zc)
                (void) zfs_set_prop_nvlist(tofs, origprops);
        }
 out:
-       if (zfsvfs) {
-               mutex_exit(&zfsvfs->z_online_recv_lock);
-               VFS_RELE(zfsvfs->z_vfs);
-       }
        nvlist_free(props);
        nvlist_free(origprops);
        releasef(fd);
@@ -3432,6 +3439,69 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
 }
 
 /*
+ * inputs:
+ * zc_name     name of filesystem
+ * zc_value    short name of snap
+ * zc_string   user-supplied tag for this reference
+ * zc_cookie   recursive flag
+ *
+ * outputs:            none
+ */
+static int  /* EINVAL for a bad snapshot name, else dsl_dataset_user_hold()'s result */
+zfs_ioc_hold(zfs_cmd_t *zc)  /* ioctl handler: place a user hold tagged zc_string */
+{
+       boolean_t recursive = zc->zc_cookie;  /* zc_cookie carries the recursive flag */
+
+       if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)  /* validate the short snapshot name first */
+               return (EINVAL);
+
+       return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value,
+           zc->zc_string, recursive));
+}
+
+/*
+ * inputs:
+ * zc_name     name of dataset from which we're releasing a user reference
+ * zc_value    short name of snap
+ * zc_string   user-supplied tag for this reference
+ * zc_cookie   recursive flag
+ *
+ * outputs:            none
+ */
+static int  /* EINVAL for a bad snapshot name, else dsl_dataset_user_release()'s result */
+zfs_ioc_release(zfs_cmd_t *zc)  /* ioctl handler: drop the user hold tagged zc_string */
+{
+       boolean_t recursive = zc->zc_cookie;  /* zc_cookie carries the recursive flag */
+
+       if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)  /* validate the short snapshot name first */
+               return (EINVAL);
+
+       return (dsl_dataset_user_release(zc->zc_name, zc->zc_value,
+           zc->zc_string, recursive));
+}
+
+/*
+ * inputs:
+ * zc_name             name of filesystem
+ *
+ * outputs:
+ * zc_nvlist_src{_size}        nvlist of snapshot holds
+ */
+static int  /* 0 on success, else the first error from the lookup or the copy-out */
+zfs_ioc_get_holds(zfs_cmd_t *zc)  /* ioctl handler: fetch the holds nvlist for zc_name */
+{
+       nvlist_t *nvp;  /* only set (and owned here) when dsl_dataset_get_holds() succeeds */
+       int error;
+
+       if ((error = dsl_dataset_get_holds(zc->zc_name, &nvp)) == 0) {
+               error = put_nvlist(zc, nvp);  /* NOTE(review): header comment names zc_nvlist_src as the output; confirm against put_nvlist() */
+               nvlist_free(nvp);  /* freed whether or not put_nvlist() succeeded */
+       }
+
+       return (error);
+}
+
+/*
  * pool create, destroy, and export don't log the history as part of
  * zfsdev_ioctl, but rather zfs_ioc_pool_create, and zfs_ioc_pool_export
  * do the logging of those commands.
@@ -3511,8 +3581,8 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
            B_TRUE },
        { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE,
            B_FALSE },
-       { zfs_ioc_obj_to_path, zfs_secpolicy_config, NO_NAME, B_FALSE,
-           B_FALSE },
+       { zfs_ioc_obj_to_path, zfs_secpolicy_config, DATASET_NAME, B_FALSE,
+           B_TRUE },
        { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE,
            B_TRUE },
        { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE,
@@ -3534,6 +3604,11 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
            DATASET_NAME, B_FALSE, B_FALSE },
        { zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade,
            DATASET_NAME, B_FALSE, B_TRUE },
+       { zfs_ioc_hold, zfs_secpolicy_hold, DATASET_NAME, B_TRUE, B_TRUE },
+       { zfs_ioc_release, zfs_secpolicy_release, DATASET_NAME, B_TRUE,
+           B_TRUE },
+       { zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
+           B_TRUE }
 };
 
 int
index 8a859b5..d03f92b 100644 (file)
@@ -935,7 +935,6 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp)
                goto out;
 
        mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
-       mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
        list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
            offsetof(znode_t, z_link_node));
@@ -1051,7 +1050,6 @@ zfsvfs_free(zfsvfs_t *zfsvfs)
        zfs_fuid_destroy(zfsvfs);
 
        mutex_destroy(&zfsvfs->z_znodes_lock);
-       mutex_destroy(&zfsvfs->z_online_recv_lock);
        mutex_destroy(&zfsvfs->z_lock);
        list_destroy(&zfsvfs->z_all_znodes);
        rrw_destroy(&zfsvfs->z_teardown_lock);
index 88d4e52..8eb4665 100644 (file)
@@ -208,6 +208,12 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
        znode_t *zp = VTOZ(vp);
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 
+       /*
+        * Clean up any locks held by this process on the vp.
+        */
+       cleanlocks(vp, ddi_get_pid(), 0);
+       cleanshares(vp, ddi_get_pid());
+
        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zp);
 
@@ -215,12 +221,6 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
        if ((flag & (FSYNC | FDSYNC)) && (count == 1))
                atomic_dec_32(&zp->z_sync_cnt);
 
-       /*
-        * Clean up any locks held by this process on the vp.
-        */
-       cleanlocks(vp, ddi_get_pid(), 0);
-       cleanshares(vp, ddi_get_pid());
-
        if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
            ZTOV(zp)->v_type == VREG &&
            !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
@@ -855,6 +855,10 @@ zfs_get_done(dmu_buf_t *db, void *vzgd)
        kmem_free(zgd, sizeof (zgd_t));
 }
 
+#ifdef DEBUG
+static int zil_fault_io = 0;
+#endif
+
 /*
  * Get data to generate a TX_WRITE intent log record.
  */
@@ -936,7 +940,21 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
                zgd->zgd_rl = rl;
                zgd->zgd_zilog = zfsvfs->z_log;
                zgd->zgd_bp = &lr->lr_blkptr;
-               VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
+#ifdef DEBUG
+               if (zil_fault_io) {
+                       error = EIO;
+                       zil_fault_io = 0;
+               } else {
+                       error = dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db);
+               }
+#else
+               error = dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db);
+#endif
+               if (error != 0) {
+                       kmem_free(zgd, sizeof (zgd_t));
+                       goto out;
+               }
+
                ASSERT(boff == db->db_offset);
                lr->lr_blkoff = off - boff;
                error = dmu_sync(zio, db, &lr->lr_blkptr,
@@ -988,6 +1006,27 @@ zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
 }
 
 /*
+ * If vnode is for a device return a specfs vnode instead.
+ */
+static int  /* 0 on success (including non-device vnodes), ENOSYS if specvp() returns NULL */
+specvp_check(vnode_t **vpp, cred_t *cr)  /* *vpp is replaced in place when it is a device vnode */
+{
+       int error = 0;
+
+       if (IS_DEVVP(*vpp)) {  /* any other vnode is left untouched */
+               struct vnode *svp;
+
+               svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
+               VN_RELE(*vpp);  /* the original hold is dropped whether or not specvp() succeeded */
+               if (svp == NULL)
+                       error = ENOSYS;
+               *vpp = svp;  /* on failure the caller is left with *vpp == NULL */
+       }
+       return (error);
+}
+
+
+/*
  * Lookup an entry in a directory, or an extended attribute directory.
  * If it exists, return a held vnode reference for it.
  *
@@ -1017,7 +1056,46 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
 {
        znode_t *zdp = VTOZ(dvp);
        zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
-       int     error;
+       int     error = 0;
+
+       /* fast path */
+       if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
+
+               if (dvp->v_type != VDIR) {
+                       return (ENOTDIR);
+               } else if (zdp->z_dbuf == NULL) {
+                       return (EIO);
+               }
+
+               if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
+                       error = zfs_fastaccesschk_execute(zdp, cr);
+                       if (!error) {
+                               *vpp = dvp;
+                               VN_HOLD(*vpp);
+                               return (0);
+                       }
+                       return (error);
+               } else {
+                       vnode_t *tvp = dnlc_lookup(dvp, nm);
+
+                       if (tvp) {
+                               error = zfs_fastaccesschk_execute(zdp, cr);
+                               if (error) {
+                                       VN_RELE(tvp);
+                                       return (error);
+                               }
+                               if (tvp == DNLC_NO_VNODE) {
+                                       VN_RELE(tvp);
+                                       return (ENOENT);
+                               } else {
+                                       *vpp = tvp;
+                                       return (specvp_check(vpp, cr));
+                               }
+                       }
+               }
+       }
+
+       DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
 
        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zdp);
@@ -1082,21 +1160,8 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
        }
 
        error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
-       if (error == 0) {
-               /*
-                * Convert device special files
-                */
-               if (IS_DEVVP(*vpp)) {
-                       vnode_t *svp;
-
-                       svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
-                       VN_RELE(*vpp);
-                       if (svp == NULL)
-                               error = ENOSYS;
-                       else
-                               *vpp = svp;
-               }
-       }
+       if (error == 0)
+               error = specvp_check(vpp, cr);
 
        ZFS_EXIT(zfsvfs);
        return (error);
@@ -1235,6 +1300,7 @@ top:
                    &acl_ids)) != 0)
                        goto out;
                if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+                       zfs_acl_ids_free(&acl_ids);
                        error = EDQUOT;
                        goto out;
                }
@@ -1332,19 +1398,7 @@ out:
                        VN_RELE(ZTOV(zp));
        } else {
                *vpp = ZTOV(zp);
-               /*
-                * If vnode is for a device return a specfs vnode instead.
-                */
-               if (IS_DEVVP(*vpp)) {
-                       struct vnode *svp;
-
-                       svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
-                       VN_RELE(*vpp);
-                       if (svp == NULL) {
-                               error = ENOSYS;
-                       }
-                       *vpp = svp;
-               }
+               error = specvp_check(vpp, cr);
        }
 
        ZFS_EXIT(zfsvfs);
@@ -1653,6 +1707,7 @@ top:
                return (error);
        }
        if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+               zfs_acl_ids_free(&acl_ids);
                zfs_dirent_unlock(dl);
                ZFS_EXIT(zfsvfs);
                return (EDQUOT);
@@ -2456,6 +2511,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
 top:
        attrzp = NULL;
 
+       /* Can this be moved to before the top label? */
        if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
                ZFS_EXIT(zfsvfs);
                return (EROFS);
@@ -2765,6 +2821,8 @@ top:
                zp->z_phys->zp_mode = new_mode;
                err = zfs_aclset_common(zp, aclp, cr, tx);
                ASSERT3U(err, ==, 0);
+               zp->z_acl_cached = aclp;
+               aclp = NULL;
                mutex_exit(&zp->z_acl_lock);
        }
 
@@ -2856,10 +2914,8 @@ out:
        if (attrzp)
                VN_RELE(ZTOV(attrzp));
 
-       if (aclp) {
+       if (aclp)
                zfs_acl_free(aclp);
-               aclp = NULL;
-       }
 
        if (fuidp) {
                zfs_fuid_info_free(fuidp);
@@ -3724,8 +3780,8 @@ top:
        if (err == 0) {
                zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
                zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
-               dmu_tx_commit(tx);
        }
+       dmu_tx_commit(tx);
 
 out:
        pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
index 8ced951..f99e72f 100644 (file)
@@ -133,6 +133,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
 
        zp->z_dbuf = NULL;
        zp->z_dirlocks = NULL;
+       zp->z_acl_cached = NULL;
        return (0);
 }
 
@@ -155,6 +156,7 @@ zfs_znode_cache_destructor(void *buf, void *arg)
 
        ASSERT(zp->z_dbuf == NULL);
        ASSERT(zp->z_dirlocks == NULL);
+       ASSERT(zp->z_acl_cached == NULL);
 }
 
 #ifdef ZNODE_STATS
@@ -199,6 +201,18 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
        nzp->z_phys = ozp->z_phys;
        nzp->z_dbuf = ozp->z_dbuf;
 
+       /*
+        * Release any cached ACL, since it *may* have
+        * zfs_acl_node_t's that directly reference an
+        * embedded ACL in the zp_acl of the old znode_phys_t.
+        *
+        * It will be recached the next time the ACL is needed.
+        */
+       if (ozp->z_acl_cached) {
+               zfs_acl_free(ozp->z_acl_cached);
+               ozp->z_acl_cached = NULL;
+       }
+
        /* Update back pointers. */
        (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys,
            znode_evict_error);
@@ -1081,6 +1095,11 @@ zfs_znode_free(znode_t *zp)
        list_remove(&zfsvfs->z_all_znodes, zp);
        mutex_exit(&zfsvfs->z_znodes_lock);
 
+       if (zp->z_acl_cached) {
+               zfs_acl_free(zp->z_acl_cached);
+               zp->z_acl_cached = NULL;
+       }
+
        kmem_cache_free(znode_cache, zp);
 
        VFS_RELE(zfsvfs->z_vfs);
index 53d9d9b..db3822f 100644 (file)
@@ -714,14 +714,15 @@ zil_lwb_write_done(zio_t *zio)
        lwb->lwb_buf = NULL;
        if (zio->io_error)
                zilog->zl_log_error = B_TRUE;
-       mutex_exit(&zilog->zl_lock);
 
        /*
         * Now that we've written this log block, we have a stable pointer
         * to the next block in the chain, so it's OK to let the txg in
-        * which we allocated the next block sync.
+        * which we allocated the next block sync.  We still hold the
+        * zl_lock to ensure zil_sync() doesn't kmem_free the lwb.
         */
        txg_rele_to_sync(&lwb->lwb_txgh);
+       mutex_exit(&zilog->zl_lock);
 }
 
 /*
@@ -925,6 +926,10 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
                        }
                        error = zilog->zl_get_data(
                            itx->itx_private, lr, dbuf, lwb->lwb_zio);
+                       if (error == EIO) {
+                               txg_wait_synced(zilog->zl_dmu_pool, txg);
+                               return (lwb);
+                       }
                        if (error) {
                                ASSERT(error == ENOENT || error == EEXIST ||
                                    error == EALREADY);