Rebase master to b105
author    Brian Behlendorf <behlendorf1@llnl.gov>
          Thu, 15 Jan 2009 21:59:39 +0000 (13:59 -0800)
committer Brian Behlendorf <behlendorf1@llnl.gov>
          Thu, 15 Jan 2009 21:59:39 +0000 (13:59 -0800)
50 files changed:
ZFS.RELEASE
cmd/zdb/zdb.c
cmd/zfs/zfs_main.c
cmd/zpool/zpool_main.c
cmd/ztest/ztest.c
lib/libzfs/include/libzfs.h
lib/libzfs/include/libzfs_impl.h
lib/libzfs/libzfs_dataset.c
lib/libzfs/libzfs_mount.c
lib/libzfs/libzfs_pool.c
lib/libzfs/libzfs_util.c
module/zfs/arc.c
module/zfs/dmu_traverse.c
module/zfs/dnode.c
module/zfs/dsl_dataset.c
module/zfs/dsl_scrub.c
module/zfs/include/sys/spa.h
module/zfs/include/sys/spa_impl.h
module/zfs/include/sys/space_map.h
module/zfs/include/sys/uberblock_impl.h
module/zfs/include/sys/vdev.h
module/zfs/include/sys/vdev_impl.h
module/zfs/include/sys/zfs_vfsops.h
module/zfs/include/sys/zil.h
module/zfs/include/sys/zil_impl.h
module/zfs/include/sys/zio.h
module/zfs/metaslab.c
module/zfs/spa.c
module/zfs/spa_config.c
module/zfs/spa_misc.c
module/zfs/space_map.c
module/zfs/txg.c
module/zfs/vdev.c
module/zfs/vdev_file.c
module/zfs/vdev_label.c
module/zfs/vdev_mirror.c
module/zfs/vdev_queue.c
module/zfs/vdev_raidz.c
module/zfs/zfs_acl.c
module/zfs/zfs_byteswap.c
module/zfs/zfs_dir.c
module/zfs/zfs_fuid.c
module/zfs/zfs_ioctl.c
module/zfs/zfs_log.c
module/zfs/zfs_vfsops.c
module/zfs/zfs_vnops.c
module/zfs/zfs_znode.c
module/zfs/zil.c
module/zfs/zio.c
module/zfs/zvol.c

diff --git a/ZFS.RELEASE b/ZFS.RELEASE
index c00b983..15f12cc 100644
@@ -1 +1 @@
-http://dlc.sun.com/osol/on/downloads/b103/on-src.tar.bz2
+http://dlc.sun.com/osol/on/downloads/b105/on-src.tar.bz2
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 253a134..0ced258 100644
@@ -87,8 +87,8 @@ static void
 usage(void)
 {
        (void) fprintf(stderr,
-           "Usage: %s [-udibcsv] [-U cachefile_path] "
-           "[-S user:cksumalg] "
+           "Usage: %s [-udibcsvL] [-U cachefile_path] [-t txg]\n"
+           "\t   [-S user:cksumalg] "
            "dataset [object...]\n"
            "       %s -C [pool]\n"
            "       %s -l dev\n"
@@ -108,6 +108,8 @@ usage(void)
            "dump blkptr signatures\n");
        (void) fprintf(stderr, "        -v verbose (applies to all others)\n");
        (void) fprintf(stderr, "        -l dump label contents\n");
+       (void) fprintf(stderr, "        -L disable leak tracking (do not "
+           "load spacemaps)\n");
        (void) fprintf(stderr, "        -U cachefile_path -- use alternate "
            "cachefile\n");
        (void) fprintf(stderr, "        -R read and display block from a "
@@ -115,6 +117,8 @@ usage(void)
        (void) fprintf(stderr, "        -e Pool is exported/destroyed/"
            "has altroot\n");
        (void) fprintf(stderr, "        -p <Path to vdev dir> (use with -e)\n");
+       (void) fprintf(stderr, "        -t <txg> highest txg to use when "
+           "searching for uberblocks\n");
        (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
            "to make only that option verbose\n");
        (void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
@@ -517,44 +521,52 @@ dump_metaslabs(spa_t *spa)
 }
 
 static void
+dump_dtl_seg(space_map_t *sm, uint64_t start, uint64_t size)
+{
+       char *prefix = (void *)sm;
+
+       (void) printf("%s [%llu,%llu) length %llu\n",
+           prefix,
+           (u_longlong_t)start,
+           (u_longlong_t)(start + size),
+           (u_longlong_t)(size));
+}
+
+static void
 dump_dtl(vdev_t *vd, int indent)
 {
-       avl_tree_t *t = &vd->vdev_dtl_map.sm_root;
-       space_seg_t *ss;
-       vdev_t *pvd;
-       int c;
+       spa_t *spa = vd->vdev_spa;
+       boolean_t required;
+       char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" };
+       char prefix[256];
+
+       spa_vdev_state_enter(spa);
+       required = vdev_dtl_required(vd);
+       (void) spa_vdev_state_exit(spa, NULL, 0);
 
        if (indent == 0)
                (void) printf("\nDirty time logs:\n\n");
 
-       (void) printf("\t%*s%s\n", indent, "",
+       (void) printf("\t%*s%s [%s]\n", indent, "",
            vd->vdev_path ? vd->vdev_path :
-           vd->vdev_parent ? vd->vdev_ops->vdev_op_type :
-           spa_name(vd->vdev_spa));
-
-       for (ss = avl_first(t); ss; ss = AVL_NEXT(t, ss)) {
-               /*
-                * Everything in this DTL must appear in all parent DTL unions.
-                */
-               for (pvd = vd; pvd; pvd = pvd->vdev_parent)
-                       ASSERT(vdev_dtl_contains(&pvd->vdev_dtl_map,
-                           ss->ss_start, ss->ss_end - ss->ss_start));
-               (void) printf("\t%*soutage [%llu,%llu] length %llu\n",
-                   indent, "",
-                   (u_longlong_t)ss->ss_start,
-                   (u_longlong_t)ss->ss_end - 1,
-                   (u_longlong_t)(ss->ss_end - ss->ss_start));
-       }
-
-       (void) printf("\n");
+           vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
+           required ? "DTL-required" : "DTL-expendable");
 
-       if (dump_opt['d'] > 5 && vd->vdev_children == 0) {
-               dump_spacemap(vd->vdev_spa->spa_meta_objset, &vd->vdev_dtl,
-                   &vd->vdev_dtl_map);
-               (void) printf("\n");
+       for (int t = 0; t < DTL_TYPES; t++) {
+               space_map_t *sm = &vd->vdev_dtl[t];
+               if (sm->sm_space == 0)
+                       continue;
+               (void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
+                   indent + 2, "", name[t]);
+               mutex_enter(sm->sm_lock);
+               space_map_walk(sm, dump_dtl_seg, (void *)prefix);
+               mutex_exit(sm->sm_lock);
+               if (dump_opt['d'] > 5 && vd->vdev_children == 0)
+                       dump_spacemap(spa->spa_meta_objset,
+                           &vd->vdev_dtl_smo, sm);
        }
 
-       for (c = 0; c < vd->vdev_children; c++)
+       for (int c = 0; c < vd->vdev_children; c++)
                dump_dtl(vd->vdev_child[c], indent + 4);
 }
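
A note on the dump_dtl() rework above: space_map_walk() hands its third argument to the callback as the first parameter, which is declared space_map_t *, so dump_dtl_seg() smuggles the banner string through that pointer and casts it back; the output also switches from a closed [start,end-1] to a half-open [start,end) interval. A minimal sketch of the same pattern with an honest void * (names hypothetical):

#include <stdio.h>
#include <stdint.h>

/*
 * Walker callbacks often get only a fixed signature; context rides
 * along as an opaque pointer and is recovered by cast inside.
 */
static void
print_seg(void *arg, uint64_t start, uint64_t size)
{
        const char *prefix = arg;       /* the smuggled banner string */

        (void) printf("%s [%llu,%llu) length %llu\n", prefix,
            (unsigned long long)start,
            (unsigned long long)(start + size),
            (unsigned long long)size);
}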
 
@@ -668,7 +680,8 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
                                break;
                        fill += cbp->blk_fill;
                }
-               ASSERT3U(fill, ==, bp->blk_fill);
+               if (!err)
+                       ASSERT3U(fill, ==, bp->blk_fill);
                (void) arc_buf_remove_ref(buf, &buf);
        }
 
@@ -1481,8 +1494,9 @@ zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, dmu_object_type_t type)
                }
        }
 
-       VERIFY(zio_wait(zio_claim(NULL, spa, spa_first_txg(spa), bp,
-           NULL, NULL, ZIO_FLAG_MUSTSUCCEED)) == 0);
+       if (!dump_opt['L'])
+               VERIFY(zio_wait(zio_claim(NULL, spa, spa_first_txg(spa), bp,
+                   NULL, NULL, ZIO_FLAG_MUSTSUCCEED)) == 0);
 }
 
 static int
@@ -1557,9 +1571,11 @@ dump_block_stats(spa_t *spa)
        int c, e;
 
        if (!dump_opt['S']) {
-               (void) printf("\nTraversing all blocks to %sverify"
-                   " nothing leaked ...\n",
-                   dump_opt['c'] ? "verify checksums and " : "");
+               (void) printf("\nTraversing all blocks %s%s%s%s...\n",
+                   (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
+                   dump_opt['c'] ? "checksums " : "",
+                   (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
+                   !dump_opt['L'] ? "nothing leaked " : "");
        }
 
        /*
@@ -1570,7 +1586,8 @@ dump_block_stats(spa_t *spa)
         * it's not part of any space map) is a double allocation,
         * reference to a freed block, or an unclaimed log block.
         */
-       zdb_leak_init(spa);
+       if (!dump_opt['L'])
+               zdb_leak_init(spa);
 
        /*
         * If there's a deferred-free bplist, process that first.
@@ -1612,7 +1629,8 @@ dump_block_stats(spa_t *spa)
        /*
         * Report any leaked segments.
         */
-       zdb_leak_fini(spa);
+       if (!dump_opt['L'])
+               zdb_leak_fini(spa);
 
        /*
         * If we're interested in printing out the blkptr signatures,
@@ -1638,14 +1656,16 @@ dump_block_stats(spa_t *spa)
        tzb = &zcb.zcb_type[ZB_TOTAL][DMU_OT_TOTAL];
 
        if (tzb->zb_asize == alloc + logalloc) {
-               (void) printf("\n\tNo leaks (block sum matches space"
-                   " maps exactly)\n");
+               if (!dump_opt['L'])
+                       (void) printf("\n\tNo leaks (block sum matches space"
+                           " maps exactly)\n");
        } else {
                (void) printf("block traversal size %llu != alloc %llu "
-                   "(leaked %lld)\n",
+                   "(%s %lld)\n",
                    (u_longlong_t)tzb->zb_asize,
                    (u_longlong_t)alloc + logalloc,
-                   (u_longlong_t)(alloc + logalloc - tzb->zb_asize));
+                   (dump_opt['L']) ? "unreachable" : "leaked",
+                   (longlong_t)(alloc + logalloc - tzb->zb_asize));
                leaks = 1;
        }
 
@@ -2235,7 +2255,7 @@ main(int argc, char **argv)
 
        dprintf_setup(&argc, argv);
 
-       while ((c = getopt(argc, argv, "udibcsvCS:U:lRep:")) != -1) {
+       while ((c = getopt(argc, argv, "udibcsvCLS:U:lRep:t:")) != -1) {
                switch (c) {
                case 'u':
                case 'd':
@@ -2249,6 +2269,9 @@ main(int argc, char **argv)
                        dump_opt[c]++;
                        dump_all = 0;
                        break;
+               case 'L':
+                       dump_opt[c]++;
+                       break;
                case 'v':
                        verbose++;
                        break;
@@ -2279,6 +2302,14 @@ main(int argc, char **argv)
                        else
                                usage();
                        break;
+               case 't':
+                       ub_max_txg = strtoull(optarg, NULL, 0);
+                       if (ub_max_txg < TXG_INITIAL) {
+                               (void) fprintf(stderr, "incorrect txg "
+                                   "specified: %s\n", optarg);
+                               usage();
+                       }
+                       break;
                default:
                        usage();
                        break;
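
The new -t parsing above range-checks the strtoull() result but, as is common in getopt handlers, still accepts trailing garbage ("-t 123abc" yields 123). A stricter variant for comparison; this is an illustrative sketch, not what the patch does, and TXG_INITIAL is hardcoded here only to keep it self-contained:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define TXG_INITIAL     4       /* assumption: value from sys/txg.h */

static int
parse_txg(const char *arg, unsigned long long *txgp)
{
        char *end;

        errno = 0;
        *txgp = strtoull(arg, &end, 0);
        if (errno != 0 || end == arg || *end != '\0' ||
            *txgp < TXG_INITIAL) {
                (void) fprintf(stderr, "incorrect txg specified: %s\n", arg);
                return (-1);
        }
        return (0);
}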
diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c
index f9480ce..161a153 100644
@@ -370,18 +370,12 @@ usage(boolean_t requested)
 
                zfs_deleg_permissions();
        } else {
-               /*
-                * TRANSLATION NOTE:
-                * "zfs set|get" must not be localised this is the
-                * command name and arguments.
-                */
-
                (void) fprintf(fp,
-                   gettext("\nFor the property list, run: zfs set|get\n"));
-
+                   gettext("\nFor the property list, run: %s\n"),
+                   "zfs set|get");
                (void) fprintf(fp,
-                   gettext("\nFor the delegated permission list, run:"
-                   " zfs allow|unallow\n"));
+                   gettext("\nFor the delegated permission list, run: %s\n"),
+                   "zfs allow|unallow");
        }
 
        /*
@@ -419,7 +413,6 @@ parseprop(nvlist_t *props)
                return (-1);
        }
        return (0);
-
 }
 
 /*
@@ -2584,14 +2577,15 @@ zfs_print_allows(char *ds)
        for (curperms = perms; curperms; curperms = curperms->z_next) {
 
                (void) snprintf(banner, sizeof (banner),
-                   "Permission sets on (%s)", curperms->z_setpoint);
+                   gettext("Permission sets on (%s)"), curperms->z_setpoint);
                allowcb.a_treeoffset =
                    offsetof(zfs_allow_node_t, z_localdescend);
                allowcb.a_permcnt = 0;
                zfs_iter_perms(&curperms->z_sets, banner, &allowcb);
 
                (void) snprintf(banner, sizeof (banner),
-                   "Create time permissions on (%s)", curperms->z_setpoint);
+                   gettext("Create time permissions on (%s)"),
+                   curperms->z_setpoint);
                allowcb.a_treeoffset =
                    offsetof(zfs_allow_node_t, z_localdescend);
                allowcb.a_permcnt = 0;
@@ -2599,7 +2593,7 @@ zfs_print_allows(char *ds)
 
 
                (void) snprintf(banner, sizeof (banner),
-                   "Local permissions on (%s)", curperms->z_setpoint);
+                   gettext("Local permissions on (%s)"), curperms->z_setpoint);
                allowcb.a_treeoffset = offsetof(zfs_allow_node_t, z_local);
                allowcb.a_permcnt = 0;
                zfs_iter_perms(&curperms->z_user, banner, &allowcb);
@@ -2607,7 +2601,8 @@ zfs_print_allows(char *ds)
                zfs_iter_perms(&curperms->z_everyone, banner, &allowcb);
 
                (void) snprintf(banner, sizeof (banner),
-                   "Descendent permissions on (%s)", curperms->z_setpoint);
+                   gettext("Descendent permissions on (%s)"),
+                   curperms->z_setpoint);
                allowcb.a_treeoffset = offsetof(zfs_allow_node_t, z_descend);
                allowcb.a_permcnt = 0;
                zfs_iter_perms(&curperms->z_user, banner, &allowcb);
@@ -2615,7 +2610,7 @@ zfs_print_allows(char *ds)
                zfs_iter_perms(&curperms->z_everyone, banner, &allowcb);
 
                (void) snprintf(banner, sizeof (banner),
-                   "Local+Descendent permissions on (%s)",
+                   gettext("Local+Descendent permissions on (%s)"),
                    curperms->z_setpoint);
                allowcb.a_treeoffset =
                    offsetof(zfs_allow_node_t, z_localdescend);
@@ -3071,7 +3066,6 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol,
                    sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
                verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts,
                    sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0);
-               canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
 
                if (op == OP_SHARE && strcmp(shareopts, "off") == 0 &&
                    strcmp(smbshareopts, "off") == 0) {
@@ -3081,7 +3075,8 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol,
                        (void) fprintf(stderr, gettext("cannot share '%s': "
                            "legacy share\n"), zfs_get_name(zhp));
                        (void) fprintf(stderr, gettext("use share(1M) to "
-                           "share this filesystem\n"));
+                           "share this filesystem, or set "
+                           "sharenfs property on\n"));
                        return (1);
                }
 
@@ -3119,6 +3114,7 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol,
                 * noauto       no              return 0
                 * noauto       yes             pass through
                 */
+               canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
                if (canmount == ZFS_CANMOUNT_OFF) {
                        if (!explicit)
                                return (0);
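
The usage() rework near the top of this file applies a standard i18n rule: literal command syntax such as "zfs set|get" must never pass through gettext(), where a translation could mangle it, so the translatable sentence takes a %s and the syntax is spliced in untranslated. The pattern in isolation:

#include <stdio.h>
#include <libintl.h>

static void
print_hint(FILE *fp)
{
        /* Translators see the sentence; the command syntax stays literal. */
        (void) fprintf(fp, gettext("\nFor the property list, run: %s\n"),
            "zfs set|get");
}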
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 54bba86..1bca45c 100644
@@ -877,17 +877,21 @@ int
 zpool_do_export(int argc, char **argv)
 {
        boolean_t force = B_FALSE;
+       boolean_t hardforce = B_FALSE;
        int c;
        zpool_handle_t *zhp;
        int ret;
        int i;
 
        /* check options */
-       while ((c = getopt(argc, argv, "f")) != -1) {
+       while ((c = getopt(argc, argv, "fF")) != -1) {
                switch (c) {
                case 'f':
                        force = B_TRUE;
                        break;
+               case 'F':
+                       hardforce = B_TRUE;
+                       break;
                case '?':
                        (void) fprintf(stderr, gettext("invalid option '%c'\n"),
                            optopt);
@@ -917,8 +921,12 @@ zpool_do_export(int argc, char **argv)
                        continue;
                }
 
-               if (zpool_export(zhp, force) != 0)
+               if (hardforce) {
+                       if (zpool_export_force(zhp) != 0)
+                               ret = 1;
+               } else if (zpool_export(zhp, force) != 0) {
                        ret = 1;
+               }
 
                zpool_close(zhp);
        }
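
Note that -F maps to a separate libzfs entry point instead of a second boolean on zpool_export(); both funnel into zpool_export_common() in the libzfs_pool.c hunk further down, with the hard-force bit carried to the kernel in zc_guid. A caller-side sketch, assuming a handle from zpool_open():

static int
export_one(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce)
{
        /* zpool_export_force() implies force; the flags never combine. */
        if (hardforce)
                return (zpool_export_force(zhp));
        return (zpool_export(zhp, force));
}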
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index 53cc6c7..4503a3d 100644
@@ -419,10 +419,10 @@ ztest_random(uint64_t range)
        return (r % range);
 }
 
+/* ARGSUSED */
 static void
 ztest_record_enospc(char *s)
 {
-       dprintf("ENOSPC doing: %s\n", s ? s : "<unknown>");
        ztest_shared->zs_enospc_count++;
 }
 
@@ -698,15 +698,9 @@ ztest_random_compress(void)
        return ((uint8_t)ztest_random(ZIO_COMPRESS_FUNCTIONS));
 }
 
-typedef struct ztest_replay {
-       objset_t        *zr_os;
-       uint64_t        zr_assign;
-} ztest_replay_t;
-
 static int
-ztest_replay_create(ztest_replay_t *zr, lr_create_t *lr, boolean_t byteswap)
+ztest_replay_create(objset_t *os, lr_create_t *lr, boolean_t byteswap)
 {
-       objset_t *os = zr->zr_os;
        dmu_tx_t *tx;
        int error;
 
@@ -715,7 +709,7 @@ ztest_replay_create(ztest_replay_t *zr, lr_create_t *lr, boolean_t byteswap)
 
        tx = dmu_tx_create(os);
        dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-       error = dmu_tx_assign(tx, zr->zr_assign);
+       error = dmu_tx_assign(tx, TXG_WAIT);
        if (error) {
                dmu_tx_abort(tx);
                return (error);
@@ -732,16 +726,15 @@ ztest_replay_create(ztest_replay_t *zr, lr_create_t *lr, boolean_t byteswap)
                (void) printf("replay create of %s object %llu"
                    " in txg %llu = %d\n",
                    osname, (u_longlong_t)lr->lr_doid,
-                   (u_longlong_t)zr->zr_assign, error);
+                   (u_longlong_t)dmu_tx_get_txg(tx), error);
        }
 
        return (error);
 }
 
 static int
-ztest_replay_remove(ztest_replay_t *zr, lr_remove_t *lr, boolean_t byteswap)
+ztest_replay_remove(objset_t *os, lr_remove_t *lr, boolean_t byteswap)
 {
-       objset_t *os = zr->zr_os;
        dmu_tx_t *tx;
        int error;
 
@@ -750,7 +743,7 @@ ztest_replay_remove(ztest_replay_t *zr, lr_remove_t *lr, boolean_t byteswap)
 
        tx = dmu_tx_create(os);
        dmu_tx_hold_free(tx, lr->lr_doid, 0, DMU_OBJECT_END);
-       error = dmu_tx_assign(tx, zr->zr_assign);
+       error = dmu_tx_assign(tx, TXG_WAIT);
        if (error) {
                dmu_tx_abort(tx);
                return (error);
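
Both replay callbacks now take the objset directly and assign their transactions with TXG_WAIT, rather than replaying into a txg threaded through the deleted ztest_replay_t wrapper; that is also what the three-argument zil_replay() calls below rely on. A stripped-down skeleton of the post-rebase callback shape (the hold and apply steps vary by record type):

static int
replay_skeleton(objset_t *os, lr_create_t *lr, boolean_t byteswap)
{
        dmu_tx_t *tx;
        int error;

        if (byteswap)
                byteswap_uint64_array(lr, sizeof (*lr));

        tx = dmu_tx_create(os);
        dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
        error = dmu_tx_assign(tx, TXG_WAIT);    /* wait for an open txg */
        if (error) {
                dmu_tx_abort(tx);
                return (error);
        }
        /* ... apply the log record under this tx ... */
        dmu_tx_commit(tx);
        return (0);
}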
@@ -977,7 +970,7 @@ ztest_vdev_attach_detach(ztest_args_t *za)
        uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
        uint64_t leaf, top;
        uint64_t ashift = ztest_get_ashift();
-       uint64_t oldguid;
+       uint64_t oldguid, pguid;
        size_t oldsize, newsize;
        char oldpath[MAXPATHLEN], newpath[MAXPATHLEN];
        int replacing;
@@ -1009,10 +1002,16 @@ ztest_vdev_attach_detach(ztest_args_t *za)
         * Locate this vdev.
         */
        oldvd = rvd->vdev_child[top];
-       if (zopt_mirrors >= 1)
+       if (zopt_mirrors >= 1) {
+               ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
+               ASSERT(oldvd->vdev_children >= zopt_mirrors);
                oldvd = oldvd->vdev_child[leaf / zopt_raidz];
-       if (zopt_raidz > 1)
+       }
+       if (zopt_raidz > 1) {
+               ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
+               ASSERT(oldvd->vdev_children == zopt_raidz);
                oldvd = oldvd->vdev_child[leaf % zopt_raidz];
+       }
 
        /*
         * If we're already doing an attach or replace, oldvd may be a
@@ -1020,8 +1019,8 @@ ztest_vdev_attach_detach(ztest_args_t *za)
         */
        while (oldvd->vdev_children != 0) {
                oldvd_has_siblings = B_TRUE;
-               ASSERT(oldvd->vdev_children == 2);
-               oldvd = oldvd->vdev_child[ztest_random(2)];
+               ASSERT(oldvd->vdev_children >= 2);
+               oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)];
        }
 
        oldguid = oldvd->vdev_guid;
@@ -1029,16 +1028,17 @@ ztest_vdev_attach_detach(ztest_args_t *za)
        oldvd_is_log = oldvd->vdev_top->vdev_islog;
        (void) strcpy(oldpath, oldvd->vdev_path);
        pvd = oldvd->vdev_parent;
+       pguid = pvd->vdev_guid;
 
        /*
         * If oldvd has siblings, then half of the time, detach it.
         */
        if (oldvd_has_siblings && ztest_random(2) == 0) {
                spa_config_exit(spa, SCL_VDEV, FTAG);
-               error = spa_vdev_detach(spa, oldguid, B_FALSE);
-               if (error != 0 && error != ENODEV && error != EBUSY)
-                       fatal(0, "detach (%s) returned %d",
-                           oldpath, error);
+               error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE);
+               if (error != 0 && error != ENODEV && error != EBUSY &&
+                   error != ENOTSUP)
+                       fatal(0, "detach (%s) returned %d", oldpath, error);
                (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
                return;
        }
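
A subtlety in the hunk above: pguid is captured while SCL_VDEV is still held, because after spa_config_exit() the vdev tree can change underneath us and pvd would be unsafe to dereference; the guid-based spa_vdev_detach() re-resolves everything itself. Reduced to its essentials:

        /* Record identity under the config lock; pass guids, not pointers. */
        oldguid = oldvd->vdev_guid;
        pguid = oldvd->vdev_parent->vdev_guid;
        spa_config_exit(spa, SCL_VDEV, FTAG);

        error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE);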
@@ -1138,7 +1138,6 @@ ztest_vdev_attach_detach(ztest_args_t *za)
 /*
  * Verify that dynamic LUN growth works as expected.
  */
-/* ARGSUSED */
 void
 ztest_vdev_LUN_growth(ztest_args_t *za)
 {
@@ -1278,7 +1277,6 @@ ztest_dmu_objset_create_destroy(ztest_args_t *za)
        zilog_t *zilog;
        uint64_t seq;
        uint64_t objects;
-       ztest_replay_t zr;
 
        (void) rw_rdlock(&ztest_shared->zs_name_lock);
        (void) snprintf(name, 100, "%s/%s_temp_%llu", za->za_pool, za->za_pool,
@@ -1295,8 +1293,7 @@ ztest_dmu_objset_create_destroy(ztest_args_t *za)
         */
        if (ztest_random(2) == 0 &&
            dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_OWNER, &os) == 0) {
-               zr.zr_os = os;
-               zil_replay(os, &zr, &zr.zr_assign, ztest_replay_vector, NULL);
+               zil_replay(os, os, ztest_replay_vector);
                dmu_objset_close(os);
        }
 
@@ -2059,8 +2056,6 @@ ztest_dmu_write_parallel(ztest_args_t *za)
        error = dmu_buf_hold(os, ZTEST_DIROBJ, blkoff, FTAG, &db);
        za->za_dbuf = db;
        if (error) {
-               dprintf("dmu_buf_hold(%s, %d, %llx) = %d\n",
-                   osname, ZTEST_DIROBJ, blkoff, error);
                (void) mutex_unlock(lp);
                return;
        }
@@ -2071,11 +2066,8 @@ ztest_dmu_write_parallel(ztest_args_t *za)
 
        (void) mutex_unlock(lp);
 
-       if (error) {
-               dprintf("dmu_sync(%s, %d, %llx) = %d\n",
-                   osname, ZTEST_DIROBJ, off, error);
+       if (error)
                return;
-       }
 
        if (blk.blk_birth == 0)         /* concurrent free */
                return;
@@ -2584,8 +2576,6 @@ ztest_fault_inject(ztest_args_t *za)
                maxfaults = INT_MAX;    /* no limit on cache devices */
        }
 
-       dprintf("damaging %s and %s\n", path0, pathrand);
-
        spa_config_exit(spa, SCL_STATE, FTAG);
 
        if (maxfaults == 0)
@@ -2595,10 +2585,13 @@ ztest_fault_inject(ztest_args_t *za)
         * If we can tolerate two or more faults, randomly online/offline vd0.
         */
        if (maxfaults >= 2 && guid0 != 0) {
-               if (ztest_random(10) < 6)
-                       (void) vdev_offline(spa, guid0, B_TRUE);
-               else
-                       (void) vdev_online(spa, guid0, B_FALSE, NULL);
+               if (ztest_random(10) < 6) {
+                       int flags = (ztest_random(2) == 0 ?
+                           ZFS_OFFLINE_TEMPORARY : 0);
+                       VERIFY(vdev_offline(spa, guid0, flags) != EBUSY);
+               } else {
+                       (void) vdev_online(spa, guid0, 0, NULL);
+               }
        }
 
        /*
@@ -2853,7 +2846,7 @@ ztest_walk_pool_directory(char *header)
 static void
 ztest_spa_import_export(char *oldname, char *newname)
 {
-       nvlist_t *config;
+       nvlist_t *config, *newconfig;
        uint64_t pool_guid;
        spa_t *spa;
        int error;
@@ -2875,6 +2868,12 @@ ztest_spa_import_export(char *oldname, char *newname)
        if (error)
                fatal(0, "spa_open('%s') = %d", oldname, error);
 
+       /*
+        * Kick off a scrub to tickle scrub/export races.
+        */
+       if (ztest_random(2) == 0)
+               (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
+
        pool_guid = spa_guid(spa);
        spa_close(spa, FTAG);
 
@@ -2883,13 +2882,20 @@ ztest_spa_import_export(char *oldname, char *newname)
        /*
         * Export it.
         */
-       error = spa_export(oldname, &config, B_FALSE);
+       error = spa_export(oldname, &config, B_FALSE, B_FALSE);
        if (error)
                fatal(0, "spa_export('%s') = %d", oldname, error);
 
        ztest_walk_pool_directory("pools after export");
 
        /*
+        * Try to import it.
+        */
+       newconfig = spa_tryimport(config);
+       ASSERT(newconfig != NULL);
+       nvlist_free(newconfig);
+
+       /*
         * Import it under the new name.
         */
        error = spa_import(newname, config, NULL);
@@ -2931,22 +2937,25 @@ ztest_spa_import_export(char *oldname, char *newname)
        nvlist_free(config);
 }
 
+static void
+ztest_resume(spa_t *spa)
+{
+       if (spa_suspended(spa)) {
+               spa_vdev_state_enter(spa);
+               vdev_clear(spa, NULL);
+               (void) spa_vdev_state_exit(spa, NULL, 0);
+               zio_resume(spa);
+       }
+}
+
 static void *
-ztest_resume(void *arg)
+ztest_resume_thread(void *arg)
 {
        spa_t *spa = arg;
 
        while (!ztest_exiting) {
                (void) poll(NULL, 0, 1000);
-
-               if (!spa_suspended(spa))
-                       continue;
-
-               spa_vdev_state_enter(spa);
-               vdev_clear(spa, NULL);
-               (void) spa_vdev_state_exit(spa, NULL, 0);
-
-               zio_resume(spa);
+               ztest_resume(spa);
        }
        return (NULL);
 }
@@ -3089,9 +3098,19 @@ ztest_run(char *pool)
        VERIFY(spa_open(pool, &spa, FTAG) == 0);
 
        /*
+        * We don't expect the pool to suspend unless maxfaults == 0,
+        * in which case ztest_fault_inject() temporarily takes away
+        * the only valid replica.
+        */
+       if (zopt_maxfaults == 0)
+               spa->spa_failmode = ZIO_FAILURE_MODE_WAIT;
+       else
+               spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;
+
+       /*
         * Create a thread to periodically resume suspended I/O.
         */
-       VERIFY(thr_create(0, 0, ztest_resume, spa, THR_BOUND,
+       VERIFY(thr_create(0, 0, ztest_resume_thread, spa, THR_BOUND,
            &resume_tid) == 0);
 
        /*
@@ -3140,7 +3159,6 @@ ztest_run(char *pool)
                za[t].za_kill = za[0].za_kill;
 
                if (t < zopt_datasets) {
-                       ztest_replay_t zr;
                        int test_future = FALSE;
                        (void) rw_rdlock(&ztest_shared->zs_name_lock);
                        (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
@@ -3164,9 +3182,8 @@ ztest_run(char *pool)
                        (void) rw_unlock(&ztest_shared->zs_name_lock);
                        if (test_future)
                                ztest_dmu_check_future_leak(&za[t]);
-                       zr.zr_os = za[d].za_os;
-                       zil_replay(zr.zr_os, &zr, &zr.zr_assign,
-                           ztest_replay_vector, NULL);
+                       zil_replay(za[d].za_os, za[d].za_os,
+                           ztest_replay_vector);
                        za[d].za_zilog = zil_open(za[d].za_os, NULL);
                }
 
@@ -3211,6 +3228,7 @@ ztest_run(char *pool)
        /* Kill the resume thread */
        ztest_exiting = B_TRUE;
        VERIFY(thr_join(resume_tid, NULL, NULL) == 0);
+       ztest_resume(spa);
 
        /*
         * Right before closing the pool, kick off a bunch of async I/O;
@@ -3306,11 +3324,6 @@ main(int argc, char **argv)
 
        process_options(argc, argv);
 
-       argc -= optind;
-       argv += optind;
-
-       dprintf_setup(&argc, argv);
-
        /*
         * Blow away any existing copy of zpool.cache
         */
diff --git a/lib/libzfs/include/libzfs.h b/lib/libzfs/include/libzfs.h
index c650865..511ee79 100644
@@ -29,6 +29,7 @@
 
 #include <assert.h>
 #include <libnvpair.h>
+#include <sys/mnttab.h>
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/varargs.h>
@@ -175,6 +176,13 @@ extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t);
 extern int libzfs_errno(libzfs_handle_t *);
 extern const char *libzfs_error_action(libzfs_handle_t *);
 extern const char *libzfs_error_description(libzfs_handle_t *);
+extern void libzfs_mnttab_init(libzfs_handle_t *);
+extern void libzfs_mnttab_fini(libzfs_handle_t *);
+extern int libzfs_mnttab_find(libzfs_handle_t *, const char *,
+    struct mnttab *);
+extern void libzfs_mnttab_add(libzfs_handle_t *, const char *,
+    const char *, const char *);
+extern void libzfs_mnttab_remove(libzfs_handle_t *, const char *);
 
 /*
  * Basic handle functions
@@ -289,6 +297,7 @@ extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **);
  * Import and export functions
  */
 extern int zpool_export(zpool_handle_t *, boolean_t);
+extern int zpool_export_force(zpool_handle_t *);
 extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *,
     char *altroot);
 extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *,
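
The libzfs_mnttab_*() functions declared above front a per-handle AVL cache of the ZFS entries in /etc/mnttab (implementation in the libzfs_dataset.c hunk below), so repeated property and mount-state lookups stop rescanning the whole file. A usage sketch; "tank/home" and the mount details are hypothetical:

        struct mnttab entry;

        /* The first find lazily populates the cache from /etc/mnttab. */
        if (libzfs_mnttab_find(hdl, "tank/home", &entry) == 0)
                (void) printf("mounted at %s\n", entry.mnt_mountp);

        /* Mount and unmount paths keep the cache coherent by hand. */
        libzfs_mnttab_add(hdl, "tank/home", "/export/home", "rw");
        libzfs_mnttab_remove(hdl, "tank/home");

        libzfs_mnttab_fini(hdl);        /* free the cached entries */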
diff --git a/lib/libzfs/include/libzfs_impl.h b/lib/libzfs/include/libzfs_impl.h
index 9f1f66d..073499b 100644
@@ -63,6 +63,7 @@ struct libzfs_handle {
        int libzfs_printerr;
        void *libzfs_sharehdl; /* libshare handle */
        uint_t libzfs_shareflags;
+       avl_tree_t libzfs_mnttab_cache;
 };
 #define        ZFSSHARE_MISS   0x01    /* Didn't find entry in cache */
 
diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c
index a8005ff..a381a0e 100644
@@ -38,7 +38,6 @@
 #include <zone.h>
 #include <fcntl.h>
 #include <sys/mntent.h>
-#include <sys/mnttab.h>
 #include <sys/mount.h>
 #include <sys/avl.h>
 #include <priv.h>
@@ -108,7 +107,6 @@ path_to_str(const char *path, int types)
                return (path_to_str(path, types & ~ZFS_TYPE_SNAPSHOT));
        }
 
-
        /*
         * The user has requested either filesystems or volumes.
         * We have no way of knowing a priori what type this would be, so always
@@ -319,38 +317,35 @@ zpool_free_handles(libzfs_handle_t *hdl)
  * Utility function to gather stats (objset and zpl) for the given object.
  */
 static int
-get_stats(zfs_handle_t *zhp)
+get_stats_ioctl(zfs_handle_t *zhp, zfs_cmd_t *zc)
 {
-       zfs_cmd_t zc = { 0 };
        libzfs_handle_t *hdl = zhp->zfs_hdl;
-       nvlist_t *allprops, *userprops;
-
-       (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
-       if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0)
-               return (-1);
+       (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name));
 
-       while (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) {
+       while (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, zc) != 0) {
                if (errno == ENOMEM) {
-                       if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
-                               zcmd_free_nvlists(&zc);
+                       if (zcmd_expand_dst_nvlist(hdl, zc) != 0) {
                                return (-1);
                        }
                } else {
-                       zcmd_free_nvlists(&zc);
                        return (-1);
                }
        }
+       return (0);
+}
 
-       zhp->zfs_dmustats = zc.zc_objset_stats; /* structure assignment */
+static int
+put_stats_zhdl(zfs_handle_t *zhp, zfs_cmd_t *zc)
+{
+       nvlist_t *allprops, *userprops;
 
-       if (zcmd_read_dst_nvlist(hdl, &zc, &allprops) != 0) {
-               zcmd_free_nvlists(&zc);
+       zhp->zfs_dmustats = zc->zc_objset_stats; /* structure assignment */
+
+       if (zcmd_read_dst_nvlist(zhp->zfs_hdl, zc, &allprops) != 0) {
                return (-1);
        }
 
-       zcmd_free_nvlists(&zc);
-
        if ((userprops = process_user_props(zhp, allprops)) == NULL) {
                nvlist_free(allprops);
                return (-1);
@@ -365,6 +360,22 @@ get_stats(zfs_handle_t *zhp)
        return (0);
 }
 
+static int
+get_stats(zfs_handle_t *zhp)
+{
+       int rc = 0;
+       zfs_cmd_t zc = { 0 };
+
+       if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0)
+               return (-1);
+       if (get_stats_ioctl(zhp, &zc) != 0)
+               rc = -1;
+       else if (put_stats_zhdl(zhp, &zc) != 0)
+               rc = -1;
+       zcmd_free_nvlists(&zc);
+       return (rc);
+}
+
 /*
  * Refresh the properties currently stored in the handle.
  */
@@ -378,16 +389,11 @@ zfs_refresh_properties(zfs_handle_t *zhp)
  * Makes a handle from the given dataset name.  Used by zfs_open() and
  * zfs_iter_* to create child handles on the fly.
  */
-zfs_handle_t *
-make_dataset_handle(libzfs_handle_t *hdl, const char *path)
+static int
+make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc)
 {
-       zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);
        char *logstr;
-
-       if (zhp == NULL)
-               return (NULL);
-
-       zhp->zfs_hdl = hdl;
+       libzfs_handle_t *hdl = zhp->zfs_hdl;
 
        /*
         * Preserve history log string.
@@ -396,17 +402,16 @@ make_dataset_handle(libzfs_handle_t *hdl, const char *path)
         */
        logstr = zhp->zfs_hdl->libzfs_log_str;
        zhp->zfs_hdl->libzfs_log_str = NULL;
-top:
-       (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name));
 
-       if (get_stats(zhp) != 0) {
+top:
+       if (put_stats_zhdl(zhp, zc) != 0) {
                zhp->zfs_hdl->libzfs_log_str = logstr;
-               free(zhp);
-               return (NULL);
+               return (-1);
        }
 
+
        if (zhp->zfs_dmustats.dds_inconsistent) {
-               zfs_cmd_t zc = { 0 };
+               zfs_cmd_t zc2 = { 0 };
 
                /*
                 * If it is dds_inconsistent, then we've caught it in
@@ -423,28 +428,33 @@ top:
                 * will fail with EBUSY and we will drive on as usual.
                 */
 
-               (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+               (void) strlcpy(zc2.zc_name, zhp->zfs_name,
+                   sizeof (zc2.zc_name));
 
                if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) {
                        (void) zvol_remove_link(hdl, zhp->zfs_name);
-                       zc.zc_objset_type = DMU_OST_ZVOL;
+                       zc2.zc_objset_type = DMU_OST_ZVOL;
                } else {
-                       zc.zc_objset_type = DMU_OST_ZFS;
+                       zc2.zc_objset_type = DMU_OST_ZFS;
                }
 
                /*
                 * If we can successfully destroy it, pretend that it
                 * never existed.
                 */
-               if (ioctl(hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc) == 0) {
+               if (ioctl(hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc2) == 0) {
                        zhp->zfs_hdl->libzfs_log_str = logstr;
-                       free(zhp);
                        errno = ENOENT;
-                       return (NULL);
+                       return (-1);
                }
-               /* If we can successfully roll it back, reget the stats */
-               if (ioctl(hdl->libzfs_fd, ZFS_IOC_ROLLBACK, &zc) == 0)
+               /* If we can successfully roll it back, reset the stats */
+               if (ioctl(hdl->libzfs_fd, ZFS_IOC_ROLLBACK, &zc2) == 0) {
+                       if (get_stats_ioctl(zhp, zc) != 0) {
+                               zhp->zfs_hdl->libzfs_log_str = logstr;
+                               return (-1);
+                       }
                        goto top;
+               }
        }
 
        /*
@@ -469,6 +479,52 @@ top:
 
        zhp->zfs_hdl->libzfs_log_str = logstr;
        zhp->zpool_hdl = zpool_handle(zhp);
+       return (0);
+}
+
+zfs_handle_t *
+make_dataset_handle(libzfs_handle_t *hdl, const char *path)
+{
+       zfs_cmd_t zc = { 0 };
+
+       zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);
+
+       if (zhp == NULL)
+               return (NULL);
+
+       zhp->zfs_hdl = hdl;
+       (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name));
+       if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) {
+               free(zhp);
+               return (NULL);
+       }
+       if (get_stats_ioctl(zhp, &zc) == -1) {
+               zcmd_free_nvlists(&zc);
+               free(zhp);
+               return (NULL);
+       }
+       if (make_dataset_handle_common(zhp, &zc) == -1) {
+               free(zhp);
+               zhp = NULL;
+       }
+       zcmd_free_nvlists(&zc);
+       return (zhp);
+}
+
+static zfs_handle_t *
+make_dataset_handle_zc(libzfs_handle_t *hdl, zfs_cmd_t *zc)
+{
+       zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);
+
+       if (zhp == NULL)
+               return (NULL);
+
+       zhp->zfs_hdl = hdl;
+       (void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name));
+       if (make_dataset_handle_common(zhp, zc) == -1) {
+               free(zhp);
+               return (NULL);
+       }
        return (zhp);
 }
 
@@ -527,6 +583,117 @@ zfs_close(zfs_handle_t *zhp)
        free(zhp);
 }
 
+typedef struct mnttab_node {
+       struct mnttab mtn_mt;
+       avl_node_t mtn_node;
+} mnttab_node_t;
+
+static int
+libzfs_mnttab_cache_compare(const void *arg1, const void *arg2)
+{
+       const mnttab_node_t *mtn1 = arg1;
+       const mnttab_node_t *mtn2 = arg2;
+       int rv;
+
+       rv = strcmp(mtn1->mtn_mt.mnt_special, mtn2->mtn_mt.mnt_special);
+
+       if (rv == 0)
+               return (0);
+       return (rv > 0 ? 1 : -1);
+}
+
+void
+libzfs_mnttab_init(libzfs_handle_t *hdl)
+{
+       struct mnttab entry;
+
+       assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0);
+       avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare,
+           sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node));
+
+       rewind(hdl->libzfs_mnttab);
+       while (getmntent(hdl->libzfs_mnttab, &entry) == 0) {
+               mnttab_node_t *mtn;
+
+               if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0)
+                       continue;
+               mtn = zfs_alloc(hdl, sizeof (mnttab_node_t));
+               mtn->mtn_mt.mnt_special = zfs_strdup(hdl, entry.mnt_special);
+               mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, entry.mnt_mountp);
+               mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, entry.mnt_fstype);
+               mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, entry.mnt_mntopts);
+               avl_add(&hdl->libzfs_mnttab_cache, mtn);
+       }
+}
+
+void
+libzfs_mnttab_fini(libzfs_handle_t *hdl)
+{
+       void *cookie = NULL;
+       mnttab_node_t *mtn;
+
+       while (mtn = avl_destroy_nodes(&hdl->libzfs_mnttab_cache, &cookie)) {
+               free(mtn->mtn_mt.mnt_special);
+               free(mtn->mtn_mt.mnt_mountp);
+               free(mtn->mtn_mt.mnt_fstype);
+               free(mtn->mtn_mt.mnt_mntopts);
+               free(mtn);
+       }
+       avl_destroy(&hdl->libzfs_mnttab_cache);
+}
+
+int
+libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname,
+    struct mnttab *entry)
+{
+       mnttab_node_t find;
+       mnttab_node_t *mtn;
+
+       if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
+               libzfs_mnttab_init(hdl);
+
+       find.mtn_mt.mnt_special = (char *)fsname;
+       mtn = avl_find(&hdl->libzfs_mnttab_cache, &find, NULL);
+       if (mtn) {
+               *entry = mtn->mtn_mt;
+               return (0);
+       }
+       return (ENOENT);
+}
+
+void
+libzfs_mnttab_add(libzfs_handle_t *hdl, const char *special,
+    const char *mountp, const char *mntopts)
+{
+       mnttab_node_t *mtn;
+
+       if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
+               return;
+       mtn = zfs_alloc(hdl, sizeof (mnttab_node_t));
+       mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special);
+       mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp);
+       mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS);
+       mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts);
+       avl_add(&hdl->libzfs_mnttab_cache, mtn);
+}
+
+void
+libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname)
+{
+       mnttab_node_t find;
+       mnttab_node_t *ret;
+
+       find.mtn_mt.mnt_special = (char *)fsname;
+       if (ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL)) {
+               avl_remove(&hdl->libzfs_mnttab_cache, ret);
+               free(ret->mtn_mt.mnt_special);
+               free(ret->mtn_mt.mnt_mountp);
+               free(ret->mtn_mt.mnt_fstype);
+               free(ret->mtn_mt.mnt_mntopts);
+               free(ret);
+       }
+}
+
 int
 zfs_spa_version(zfs_handle_t *zhp, int *spa_version)
 {
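
One detail in libzfs_mnttab_cache_compare() above: AVL comparators must return exactly -1, 0, or +1, so the raw strcmp() result is clamped rather than returned directly. The clamp in isolation (hypothetical helper name):

#include <string.h>

/* avl_create() callbacks must collapse strcmp()'s sign to -1/0/+1. */
static int
avl_strcmp(const char *a, const char *b)
{
        int rv = strcmp(a, b);

        if (rv == 0)
                return (0);
        return (rv > 0 ? 1 : -1);
}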
@@ -2123,15 +2290,11 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src,
         */
        if (!zhp->zfs_mntcheck &&
            (mntopt_on != NULL || prop == ZFS_PROP_MOUNTED)) {
-               struct mnttab entry, search = { 0 };
-               FILE *mnttab = zhp->zfs_hdl->libzfs_mnttab;
+               libzfs_handle_t *hdl = zhp->zfs_hdl;
+               struct mnttab entry;
 
-               search.mnt_special = (char *)zhp->zfs_name;
-               search.mnt_fstype = MNTTYPE_ZFS;
-               rewind(mnttab);
-
-               if (getmntany(mnttab, &entry, &search) == 0) {
-                       zhp->zfs_mntopts = zfs_strdup(zhp->zfs_hdl,
+               if (libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0) {
+                       zhp->zfs_mntopts = zfs_strdup(hdl,
                            entry.mnt_mntopts);
                        if (zhp->zfs_mntopts == NULL)
                                return (-1);
@@ -2575,6 +2738,46 @@ zfs_get_type(const zfs_handle_t *zhp)
        return (zhp->zfs_type);
 }
 
+static int
+zfs_do_list_ioctl(zfs_handle_t *zhp, int arg, zfs_cmd_t *zc)
+{
+       int rc;
+       uint64_t        orig_cookie;
+
+       orig_cookie = zc->zc_cookie;
+top:
+       (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name));
+       rc = ioctl(zhp->zfs_hdl->libzfs_fd, arg, zc);
+
+       if (rc == -1) {
+               switch (errno) {
+               case ENOMEM:
+                       /* expand nvlist memory and try again */
+                       if (zcmd_expand_dst_nvlist(zhp->zfs_hdl, zc) != 0) {
+                               zcmd_free_nvlists(zc);
+                               return (-1);
+                       }
+                       zc->zc_cookie = orig_cookie;
+                       goto top;
+               /*
+                * An errno value of ESRCH indicates normal completion.
+                * If ENOENT is returned, then the underlying dataset
+                * has been removed since we obtained the handle.
+                */
+               case ESRCH:
+               case ENOENT:
+                       rc = 1;
+                       break;
+               default:
+                       rc = zfs_standard_error(zhp->zfs_hdl, errno,
+                           dgettext(TEXT_DOMAIN,
+                           "cannot iterate filesystems"));
+                       break;
+               }
+       }
+       return (rc);
+}
+
 /*
  * Iterate over all child filesystems
  */
@@ -2588,9 +2791,11 @@ zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data)
        if (zhp->zfs_type != ZFS_TYPE_FILESYSTEM)
                return (0);
 
-       for ((void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
-           ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0;
-           (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name))) {
+       if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0)
+               return (-1);
+
+       while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_DATASET_LIST_NEXT,
+           &zc)) == 0) {
                /*
                 * Ignore private dataset names.
                 */
@@ -2601,24 +2806,18 @@ zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data)
                 * Silently ignore errors, as the only plausible explanation is
                 * that the pool has since been removed.
                 */
-               if ((nzhp = make_dataset_handle(zhp->zfs_hdl,
-                   zc.zc_name)) == NULL)
+               if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl,
+                   &zc)) == NULL) {
                        continue;
+               }
 
-               if ((ret = func(nzhp, data)) != 0)
+               if ((ret = func(nzhp, data)) != 0) {
+                       zcmd_free_nvlists(&zc);
                        return (ret);
+               }
        }
-
-       /*
-        * An errno value of ESRCH indicates normal completion.  If ENOENT is
-        * returned, then the underlying dataset has been removed since we
-        * obtained the handle.
-        */
-       if (errno != ESRCH && errno != ENOENT)
-               return (zfs_standard_error(zhp->zfs_hdl, errno,
-                   dgettext(TEXT_DOMAIN, "cannot iterate filesystems")));
-
-       return (0);
+       zcmd_free_nvlists(&zc);
+       return ((ret < 0) ? ret : 0);
 }
 
 /*
@@ -2634,29 +2833,23 @@ zfs_iter_snapshots(zfs_handle_t *zhp, zfs_iter_f func, void *data)
        if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT)
                return (0);
 
-       for ((void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
-           ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SNAPSHOT_LIST_NEXT,
-           &zc) == 0;
-           (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name))) {
+       if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0)
+               return (-1);
+       while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_SNAPSHOT_LIST_NEXT,
+           &zc)) == 0) {
 
-               if ((nzhp = make_dataset_handle(zhp->zfs_hdl,
-                   zc.zc_name)) == NULL)
+               if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl,
+                   &zc)) == NULL) {
                        continue;
+               }
 
-               if ((ret = func(nzhp, data)) != 0)
+               if ((ret = func(nzhp, data)) != 0) {
+                       zcmd_free_nvlists(&zc);
                        return (ret);
+               }
        }
-
-       /*
-        * An errno value of ESRCH indicates normal completion.  If ENOENT is
-        * returned, then the underlying dataset has been removed since we
-        * obtained the handle.  Silently ignore this case, and return success.
-        */
-       if (errno != ESRCH && errno != ENOENT)
-               return (zfs_standard_error(zhp->zfs_hdl, errno,
-                   dgettext(TEXT_DOMAIN, "cannot iterate filesystems")));
-
-       return (0);
+       zcmd_free_nvlists(&zc);
+       return ((ret < 0) ? ret : 0);
 }
 
 /*
@@ -2709,8 +2902,8 @@ check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned,
        zfs_handle_t *zhp;
        char errbuf[1024];
 
-       (void) snprintf(errbuf, sizeof (errbuf), "cannot create '%s'",
-           path);
+       (void) snprintf(errbuf, sizeof (errbuf),
+           dgettext(TEXT_DOMAIN, "cannot create '%s'"), path);
 
        /* get parent, and check to see if this is just a pool */
        if (parent_name(path, parent, sizeof (parent)) != 0) {
diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c
index 7c5c7f3..0668dea 100644
@@ -74,7 +74,6 @@
 #include <unistd.h>
 #include <zone.h>
 #include <sys/mntent.h>
-#include <sys/mnttab.h>
 #include <sys/mount.h>
 #include <sys/stat.h>
 
@@ -236,18 +235,9 @@ dir_is_empty(const char *dirname)
 boolean_t
 is_mounted(libzfs_handle_t *zfs_hdl, const char *special, char **where)
 {
-       struct mnttab search = { 0 }, entry;
-
-       /*
-        * Search for the entry in /etc/mnttab.  We don't bother getting the
-        * mountpoint, as we can just search for the special device.  This will
-        * also let us find mounts when the mountpoint is 'legacy'.
-        */
-       search.mnt_special = (char *)special;
-       search.mnt_fstype = MNTTYPE_ZFS;
+       struct mnttab entry;
 
-       rewind(zfs_hdl->libzfs_mnttab);
-       if (getmntany(zfs_hdl->libzfs_mnttab, &entry, &search) != 0)
+       if (libzfs_mnttab_find(zfs_hdl, special, &entry) != 0)
                return (B_FALSE);
 
        if (where != NULL)
@@ -358,12 +348,14 @@ zfs_mount(zfs_handle_t *zhp, const char *options, int flags)
                } else {
                        zfs_error_aux(hdl, strerror(errno));
                }
-
                return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED,
                    dgettext(TEXT_DOMAIN, "cannot mount '%s'"),
                    zhp->zfs_name));
        }
 
+       /* add the mounted entry into our cache */
+       libzfs_mnttab_add(hdl, zfs_get_name(zhp), mountpoint,
+           mntopts);
        return (0);
 }
 
@@ -389,26 +381,23 @@ unmount_one(libzfs_handle_t *hdl, const char *mountpoint, int flags)
 int
 zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags)
 {
-       struct mnttab search = { 0 }, entry;
+       libzfs_handle_t *hdl = zhp->zfs_hdl;
+       struct mnttab entry;
        char *mntpt = NULL;
 
-       /* check to see if need to unmount the filesystem */
-       search.mnt_special = zhp->zfs_name;
-       search.mnt_fstype = MNTTYPE_ZFS;
-       rewind(zhp->zfs_hdl->libzfs_mnttab);
+       /* check to see if we need to unmount the filesystem */
        if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) &&
-           getmntany(zhp->zfs_hdl->libzfs_mnttab, &entry, &search) == 0)) {
-
+           libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0)) {
                /*
                 * mountpoint may have come from a call to
                 * getmnt/getmntany if it isn't NULL. If it is NULL,
-                * we know it comes from getmntany which can then get
-                * overwritten later. We strdup it to play it safe.
+                * we know it comes from libzfs_mnttab_find which can
+                * then get freed later. We strdup it to play it safe.
                 */
                if (mountpoint == NULL)
-                       mntpt = zfs_strdup(zhp->zfs_hdl, entry.mnt_mountp);
+                       mntpt = zfs_strdup(hdl, entry.mnt_mountp);
                else
-                       mntpt = zfs_strdup(zhp->zfs_hdl, mountpoint);
+                       mntpt = zfs_strdup(hdl, mountpoint);
 
                /*
                 * Unshare and unmount the filesystem
@@ -416,11 +405,12 @@ zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags)
                if (zfs_unshare_proto(zhp, mntpt, share_all_proto) != 0)
                        return (-1);
 
-               if (unmount_one(zhp->zfs_hdl, mntpt, flags) != 0) {
+               if (unmount_one(hdl, mntpt, flags) != 0) {
                        free(mntpt);
                        (void) zfs_shareall(zhp);
                        return (-1);
                }
+               libzfs_mnttab_remove(hdl, zhp->zfs_name);
                free(mntpt);
        }
 
@@ -849,7 +839,7 @@ unshare_one(libzfs_handle_t *hdl, const char *name, const char *mountpoint,
        char *mntpt;
        /*
         * Mountpoint could get trashed if libshare calls getmntany
-        * which id does during API initialization, so strdup the
+        * which it does during API initialization, so strdup the
         * value.
         */
        mntpt = zfs_strdup(hdl, mountpoint);
@@ -887,18 +877,17 @@ int
 zfs_unshare_proto(zfs_handle_t *zhp, const char *mountpoint,
     zfs_share_proto_t *proto)
 {
-       struct mnttab search = { 0 }, entry;
+       libzfs_handle_t *hdl = zhp->zfs_hdl;
+       struct mnttab entry;
        char *mntpt = NULL;
 
        /* check to see if need to unmount the filesystem */
-       search.mnt_special = (char *)zfs_get_name(zhp);
-       search.mnt_fstype = MNTTYPE_ZFS;
        rewind(zhp->zfs_hdl->libzfs_mnttab);
        if (mountpoint != NULL)
-               mntpt = zfs_strdup(zhp->zfs_hdl, mountpoint);
+               mountpoint = mntpt = zfs_strdup(hdl, mountpoint);
 
        if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) &&
-           getmntany(zhp->zfs_hdl->libzfs_mnttab, &entry, &search) == 0)) {
+           libzfs_mnttab_find(hdl, zfs_get_name(zhp), &entry) == 0)) {
                zfs_share_proto_t *curr_proto;
 
                if (mountpoint == NULL)
@@ -907,8 +896,8 @@ zfs_unshare_proto(zfs_handle_t *zhp, const char *mountpoint,
                for (curr_proto = proto; *curr_proto != PROTO_END;
                    curr_proto++) {
 
-                       if (is_shared(zhp->zfs_hdl, mntpt, *curr_proto) &&
-                           unshare_one(zhp->zfs_hdl, zhp->zfs_name,
+                       if (is_shared(hdl, mntpt, *curr_proto) &&
+                           unshare_one(hdl, zhp->zfs_name,
                            mntpt, *curr_proto) != 0) {
                                if (mntpt != NULL)
                                        free(mntpt);
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index dc5407b..9cb5d76 100644
@@ -1127,7 +1127,7 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot)
  * mounted datasets in the pool.
  */
 int
-zpool_export(zpool_handle_t *zhp, boolean_t force)
+zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce)
 {
        zfs_cmd_t zc = { 0 };
        char msg[1024];
@@ -1140,6 +1140,7 @@ zpool_export(zpool_handle_t *zhp, boolean_t force)
 
        (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
        zc.zc_cookie = force;
+       zc.zc_guid = hardforce;
 
        if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_EXPORT, &zc) != 0) {
                switch (errno) {
@@ -1160,6 +1161,18 @@ zpool_export(zpool_handle_t *zhp, boolean_t force)
        return (0);
 }
 
+int
+zpool_export(zpool_handle_t *zhp, boolean_t force)
+{
+       return (zpool_export_common(zhp, force, B_FALSE));
+}
+
+int
+zpool_export_force(zpool_handle_t *zhp)
+{
+       return (zpool_export_common(zhp, B_TRUE, B_TRUE));
+}
+
 /*
  * zpool_import() is a contracted interface. Should be kept the same
  * if possible.
@@ -1182,7 +1195,9 @@ zpool_import(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
                }
 
                if (nvlist_add_string(props,
-                   zpool_prop_to_name(ZPOOL_PROP_ALTROOT), altroot) != 0) {
+                   zpool_prop_to_name(ZPOOL_PROP_ALTROOT), altroot) != 0 ||
+                   nvlist_add_string(props,
+                   zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), "none") != 0) {
                        nvlist_free(props);
                        return (zfs_error_fmt(hdl, EZFS_NOMEM,
                            dgettext(TEXT_DOMAIN, "cannot import '%s'"),
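
The zpool_import() change above pins cachefile=none alongside altroot, following the convention that an altroot import is temporary and should not be recorded in the default cache file. Equivalent property construction from a hypothetical caller ("/mnt" is illustrative):

        nvlist_t *props = NULL;

        VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
        VERIFY(nvlist_add_string(props,
            zpool_prop_to_name(ZPOOL_PROP_ALTROOT), "/mnt") == 0);
        VERIFY(nvlist_add_string(props,
            zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), "none") == 0);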
diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c
index 54de0f4..4270ac5 100644
@@ -796,6 +796,10 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
        cbp->cb_colwidths[GET_COL_SOURCE] = strlen(dgettext(TEXT_DOMAIN,
            "SOURCE"));
 
+       /* first property is always NAME */
+       assert(cbp->cb_proplist->pl_prop ==
+           ((type == ZFS_TYPE_POOL) ?  ZPOOL_PROP_NAME : ZFS_PROP_NAME));
+
        /*
         * Go through and calculate the widths for each column.  For the
         * 'source' column, we kludge it up by taking the worst-case scenario of
@@ -823,9 +827,13 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
                }
 
                /*
-                * 'VALUE' column
+                * 'VALUE' column.  The first property is always the 'name'
+                * property that was tacked on either by /sbin/zfs's
+                * zfs_do_get() or when calling zprop_expand_list(), so we
+                * ignore its width.  If the user specified the name property
+                * to display, then it will be later in the list in any case.
                 */
-               if ((pl->pl_prop != ZFS_PROP_NAME || !pl->pl_all) &&
+               if (pl != cbp->cb_proplist &&
                    pl->pl_width > cbp->cb_colwidths[GET_COL_VALUE])
                        cbp->cb_colwidths[GET_COL_VALUE] = pl->pl_width;
 
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 73aecb2..464fe9f 100644
@@ -3491,6 +3491,7 @@ arc_fini(void)
        mutex_destroy(&arc_mru_ghost->arcs_mtx);
        mutex_destroy(&arc_mfu->arcs_mtx);
        mutex_destroy(&arc_mfu_ghost->arcs_mtx);
+       mutex_destroy(&arc_l2c_only->arcs_mtx);
 
        mutex_destroy(&zfs_write_limit_lock);
 
@@ -4457,7 +4458,7 @@ l2arc_fini(void)
 void
 l2arc_start(void)
 {
-       if (!(spa_mode & FWRITE))
+       if (!(spa_mode_global & FWRITE))
                return;
 
        (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
@@ -4467,7 +4468,7 @@ l2arc_start(void)
 void
 l2arc_stop(void)
 {
-       if (!(spa_mode & FWRITE))
+       if (!(spa_mode_global & FWRITE))
                return;
 
        mutex_enter(&l2arc_feed_thr_lock);
index 5124014..197284e 100644 (file)
@@ -119,7 +119,7 @@ traverse_zil(struct traverse_data *td, zil_header_t *zh)
         * We only want to visit blocks that have been claimed but not yet
         * replayed (or, in read-only mode, blocks that *would* be claimed).
         */
-       if (claim_txg == 0 && (spa_mode & FWRITE))
+       if (claim_txg == 0 && spa_writeable(td->td_spa))
                return;
 
        zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
index e77834d..8686ab9 100644 (file)
@@ -56,6 +56,8 @@ dnode_cons(void *arg, void *unused, int kmflag)
        rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
        mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
+       cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
+
        refcount_create(&dn->dn_holds);
        refcount_create(&dn->dn_tx_holds);
 
@@ -84,6 +86,7 @@ dnode_dest(void *arg, void *unused)
        rw_destroy(&dn->dn_struct_rwlock);
        mutex_destroy(&dn->dn_mtx);
        mutex_destroy(&dn->dn_dbufs_mtx);
+       cv_destroy(&dn->dn_notxholds);
        refcount_destroy(&dn->dn_holds);
        refcount_destroy(&dn->dn_tx_holds);
 
index 93ea8aa..e488b2b 100644 (file)
@@ -1948,6 +1948,9 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
        if (ds->ds_phys->ds_next_snap_obj) {
                stat->dds_is_snapshot = B_TRUE;
                stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
+       } else {
+               stat->dds_is_snapshot = B_FALSE;
+               stat->dds_num_clones = 0;
        }
 
        /* clone origin is really a dsl_dir thing... */
@@ -1959,6 +1962,8 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
                    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
                dsl_dataset_name(ods, stat->dds_origin);
                dsl_dataset_drop_ref(ods, FTAG);
+       } else {
+               stat->dds_origin[0] = '\0';
        }
        rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
 }
index 950a91f..dbdfe8c 100644 (file)
@@ -391,7 +391,7 @@ traverse_zil(dsl_pool_t *dp, zil_header_t *zh)
         * We only want to visit blocks that have been claimed but not yet
         * replayed (or, in read-only mode, blocks that *would* be claimed).
         */
-       if (claim_txg == 0 && (spa_mode & FWRITE))
+       if (claim_txg == 0 && spa_writeable(dp->dp_spa))
                return;
 
        zilog = zil_alloc(dp->dp_meta_objset, zh);
@@ -409,9 +409,6 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
        int err;
        arc_buf_t *buf = NULL;
 
-       if (bp->blk_birth == 0)
-               return;
-
        if (bp->blk_birth <= dp->dp_scrub_min_txg)
                return;
 
@@ -740,6 +737,7 @@ enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
 void
 dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 {
+       spa_t *spa = dp->dp_spa;
        zap_cursor_t zc;
        zap_attribute_t za;
        boolean_t complete = B_TRUE;
@@ -747,8 +745,10 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
        if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
                return;
 
-       /* If the spa is not fully loaded, don't bother. */
-       if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE)
+       /*
+        * If the pool is not loaded, or is trying to unload, leave it alone.
+        */
+       if (spa->spa_load_state != SPA_LOAD_NONE || spa_shutting_down(spa))
                return;
 
        if (dp->dp_scrub_restart) {
@@ -757,13 +757,13 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                dsl_pool_scrub_setup_sync(dp, &func, kcred, tx);
        }
 
-       if (dp->dp_spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
+       if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
                /*
                 * We must have resumed after rebooting; reset the vdev
                 * stats to know that we're doing a scrub (although it
                 * will think we're just starting now).
                 */
-               vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev,
+               vdev_scrub_stat_update(spa->spa_root_vdev,
                    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
                    POOL_SCRUB_EVERYTHING, B_FALSE);
        }
@@ -771,7 +771,7 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
        dp->dp_scrub_pausing = B_FALSE;
        dp->dp_scrub_start_time = lbolt64;
        dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);
-       dp->dp_spa->spa_scrub_active = B_TRUE;
+       spa->spa_scrub_active = B_TRUE;
 
        if (dp->dp_scrub_bookmark.zb_objset == 0) {
                /* First do the MOS & ORIGIN */
@@ -779,8 +779,8 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                if (dp->dp_scrub_pausing)
                        goto out;
 
-               if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
-                       VERIFY(0 == dmu_objset_find_spa(dp->dp_spa,
+               if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
+                       VERIFY(0 == dmu_objset_find_spa(spa,
                            NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
                } else {
                        scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx);
@@ -830,15 +830,13 @@ out:
        VERIFY(0 == zap_update(dp->dp_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT,
            DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
-           &dp->dp_spa->spa_scrub_errors, tx));
+           &spa->spa_scrub_errors, tx));
 
        /* XXX this is scrub-clean specific */
-       mutex_enter(&dp->dp_spa->spa_scrub_lock);
-       while (dp->dp_spa->spa_scrub_inflight > 0) {
-               cv_wait(&dp->dp_spa->spa_scrub_io_cv,
-                   &dp->dp_spa->spa_scrub_lock);
-       }
-       mutex_exit(&dp->dp_spa->spa_scrub_lock);
+       mutex_enter(&spa->spa_scrub_lock);
+       while (spa->spa_scrub_inflight > 0)
+               cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+       mutex_exit(&spa->spa_scrub_lock);
 }
 
 void
@@ -920,13 +918,17 @@ static int
 dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
     const blkptr_t *bp, const zbookmark_t *zb)
 {
-       size_t size = BP_GET_LSIZE(bp);
-       int d;
+       size_t size = BP_GET_PSIZE(bp);
        spa_t *spa = dp->dp_spa;
        boolean_t needs_io;
-       int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
+       int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
        int zio_priority;
 
+       ASSERT(bp->blk_birth > dp->dp_scrub_min_txg);
+
+       if (bp->blk_birth >= dp->dp_scrub_max_txg)
+               return (0);
+
        count_block(dp->dp_blkstats, bp);
 
        if (dp->dp_scrub_isresilver == 0) {
@@ -945,7 +947,7 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
        if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
                zio_flags |= ZIO_FLAG_SPECULATIVE;
 
-       for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+       for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
                vdev_t *vd = vdev_lookup_top(spa,
                    DVA_GET_VDEV(&bp->blk_dva[d]));
 
@@ -963,16 +965,17 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
                        if (DVA_GET_GANG(&bp->blk_dva[d])) {
                                /*
                                 * Gang members may be spread across multiple
-                                * vdevs, so the best we can do is look at the
-                                * pool-wide DTL.
+                                * vdevs, so the best estimate we have is the
+                                * scrub range, which has already been checked.
                                 * XXX -- it would be better to change our
-                                * allocation policy to ensure that this can't
-                                * happen.
+                                * allocation policy to ensure that all
+                                * gang members reside on the same vdev.
                                 */
-                               vd = spa->spa_root_vdev;
+                               needs_io = B_TRUE;
+                       } else {
+                               needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
+                                   bp->blk_birth, 1);
                        }
-                       needs_io = vdev_dtl_contains(&vd->vdev_dtl_map,
-                           bp->blk_birth, 1);
                }
        }
 
index 24b3ca4..519b1d0 100644 (file)
@@ -332,7 +332,8 @@ extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props);
 extern int spa_import_faulted(const char *, nvlist_t *, nvlist_t *);
 extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
 extern int spa_destroy(char *pool);
-extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force);
+extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
+    boolean_t hardforce);
 extern int spa_reset(char *pool);
 extern void spa_async_request(spa_t *spa, int flag);
 extern void spa_async_unrequest(spa_t *spa, int flag);
@@ -351,7 +352,8 @@ extern void spa_inject_delref(spa_t *spa);
 extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
 extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
     int replacing);
-extern int spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done);
+extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
+    int replace_done);
 extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
 extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
 
@@ -475,6 +477,8 @@ extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
 extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp);
 extern boolean_t spa_has_slogs(spa_t *spa);
 extern boolean_t spa_is_root(spa_t *spa);
+extern boolean_t spa_writeable(spa_t *spa);
+extern int spa_mode(spa_t *spa);
 
 /* history logging */
 typedef enum history_log_type {
@@ -545,7 +549,7 @@ _NOTE(CONSTCOND) } while (0)
 #define        dprintf_bp(bp, fmt, ...)
 #endif
 
-extern int spa_mode;                   /* mode, e.g. FREAD | FWRITE */
+extern int spa_mode_global;                    /* mode, e.g. FREAD | FWRITE */
 
 #ifdef __cplusplus
 }
index 8aeb414..588b4f5 100644 (file)
@@ -170,6 +170,7 @@ struct spa {
        boolean_t       spa_import_faulted;     /* allow faulted vdevs */
        boolean_t       spa_is_root;            /* pool is root */
        int             spa_minref;             /* num refs when first opened */
+       int             spa_mode;               /* FREAD | FWRITE */
        spa_log_state_t spa_log_state;          /* log state */
        /*
         * spa_refcnt & spa_config_lock must be the last elements
index db9daef..8d78606 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _SYS_SPACE_MAP_H
 #define        _SYS_SPACE_MAP_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/avl.h>
 #include <sys/dmu.h>
 
@@ -58,6 +56,12 @@ typedef struct space_seg {
        uint64_t        ss_end;         /* ending offset (non-inclusive) */
 } space_seg_t;
 
+typedef struct space_ref {
+       avl_node_t      sr_node;        /* AVL node */
+       uint64_t        sr_offset;      /* offset (start or end) */
+       int64_t         sr_refcnt;      /* associated reference count */
+} space_ref_t;
+
 typedef struct space_map_obj {
        uint64_t        smo_object;     /* on-disk space map object */
        uint64_t        smo_objsize;    /* size of the object */
@@ -133,13 +137,12 @@ extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size,
 extern void space_map_destroy(space_map_t *sm);
 extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size);
 extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size);
-extern int space_map_contains(space_map_t *sm, uint64_t start, uint64_t size);
+extern boolean_t space_map_contains(space_map_t *sm,
+    uint64_t start, uint64_t size);
 extern void space_map_vacate(space_map_t *sm,
     space_map_func_t *func, space_map_t *mdest);
 extern void space_map_walk(space_map_t *sm,
     space_map_func_t *func, space_map_t *mdest);
-extern void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size);
-extern void space_map_union(space_map_t *smd, space_map_t *sms);
 
 extern void space_map_load_wait(space_map_t *sm);
 extern int space_map_load(space_map_t *sm, space_map_ops_t *ops,
@@ -155,6 +158,15 @@ extern void space_map_sync(space_map_t *sm, uint8_t maptype,
 extern void space_map_truncate(space_map_obj_t *smo,
     objset_t *os, dmu_tx_t *tx);
 
+extern void space_map_ref_create(avl_tree_t *t);
+extern void space_map_ref_destroy(avl_tree_t *t);
+extern void space_map_ref_add_seg(avl_tree_t *t,
+    uint64_t start, uint64_t end, int64_t refcnt);
+extern void space_map_ref_add_map(avl_tree_t *t,
+    space_map_t *sm, int64_t refcnt);
+extern void space_map_ref_generate_map(avl_tree_t *t,
+    space_map_t *sm, int64_t minref);
+
 #ifdef __cplusplus
 }
 #endif
index 55a0dd5..b49df8a 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _SYS_UBERBLOCK_IMPL_H
 #define        _SYS_UBERBLOCK_IMPL_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/uberblock.h>
 
 #ifdef __cplusplus
@@ -35,6 +33,11 @@ extern "C" {
 #endif
 
 /*
+ * For zdb use and debugging purposes only
+ */
+extern uint64_t ub_max_txg;
+
+/*
  * The uberblock version is incremented whenever an incompatible on-disk
  * format change is made to the SPA, DMU, or ZAP.
  *
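
ub_max_txg is the hook behind the new txg cap: when set, label examination ignores any uberblock born after it, letting a debugger reconstruct an earlier view of the pool. A hedged sketch of how a debugging consumer might use the symbol (the wrapper function is hypothetical):

	/* Sketch only: a debugging consumer capping uberblock selection. */
	static void
	limit_uberblocks(uint64_t txg)
	{
		extern uint64_t ub_max_txg;

		ub_max_txg = txg;	/* uberblocks born later are ignored */
	}
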
index c070d6f..b8313a9 100644 (file)
 extern "C" {
 #endif
 
+typedef enum vdev_dtl_type {
+       DTL_MISSING,    /* 0% replication: no copies of the data */
+       DTL_PARTIAL,    /* less than 100% replication: some copies missing */
+       DTL_SCRUB,      /* unable to fully repair during scrub/resilver */
+       DTL_OUTAGE,     /* temporarily missing (used to attempt detach) */
+       DTL_TYPES
+} vdev_dtl_type_t;
+
 extern boolean_t zfs_nocacheflush;
 
 extern int vdev_open(vdev_t *);
@@ -50,10 +58,14 @@ extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
 extern boolean_t vdev_is_bootable(vdev_t *vd);
 extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
 extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
-extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size);
-extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size);
+extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
+    uint64_t txg, uint64_t size);
+extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
+    uint64_t txg, uint64_t size);
+extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
 extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
     int scrub_done);
+extern boolean_t vdev_dtl_required(vdev_t *vd);
 extern boolean_t vdev_resilver_needed(vdev_t *vd,
     uint64_t *minp, uint64_t *maxp);
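
Splitting the DTL into typed maps lets callers ask precise availability questions instead of poking at the raw space maps. A sketch of the detach-safety test these interfaces support (vd is assumed to be a leaf vdev; illustrative only):

	/*
	 * Sketch only: a leaf is detachable when it holds the sole copy
	 * of nothing and is itself missing nothing.
	 */
	boolean_t can_detach = !vdev_dtl_required(vd) &&
	    vdev_dtl_empty(vd, DTL_MISSING);
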
 
index 26904d0..6523493 100644 (file)
@@ -123,8 +123,7 @@ struct vdev {
        vdev_t          *vdev_parent;   /* parent vdev                  */
        vdev_t          **vdev_child;   /* array of children            */
        uint64_t        vdev_children;  /* number of children           */
-       space_map_t     vdev_dtl_map;   /* dirty time log in-core state */
-       space_map_t     vdev_dtl_scrub; /* DTL for scrub repair writes  */
+       space_map_t     vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */
        vdev_stat_t     vdev_stat;      /* virtual device statistics    */
 
        /*
@@ -149,7 +148,7 @@ struct vdev {
         * Leaf vdev state.
         */
        uint64_t        vdev_psize;     /* physical device capacity     */
-       space_map_obj_t vdev_dtl;       /* dirty time log on-disk state */
+       space_map_obj_t vdev_dtl_smo;   /* dirty time log space map obj */
        txg_node_t      vdev_dtl_node;  /* per-txg dirty DTL linkage    */
        uint64_t        vdev_wholedisk; /* true if this is a whole disk */
        uint64_t        vdev_offline;   /* persistent offline state     */
index 87b75e6..7e0440b 100644 (file)
@@ -26,8 +26,6 @@
 #ifndef        _SYS_FS_ZFS_VFSOPS_H
 #define        _SYS_FS_ZFS_VFSOPS_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/isa_defs.h>
 #include <sys/types32.h>
 #include <sys/list.h>
@@ -49,7 +47,6 @@ struct zfsvfs {
        uint64_t        z_root;         /* id of root znode */
        uint64_t        z_unlinkedobj;  /* id of unlinked zapobj */
        uint64_t        z_max_blksz;    /* maximum block size for files */
-       uint64_t        z_assign;       /* TXG_NOWAIT or set by zil_replay() */
        uint64_t        z_fuid_obj;     /* fuid table object number */
        uint64_t        z_fuid_size;    /* fuid table size */
        avl_tree_t      z_fuid_idx;     /* fuid tree keyed by index */
@@ -74,6 +71,7 @@ struct zfsvfs {
        boolean_t       z_issnap;       /* true if this is a snapshot */
        boolean_t       z_vscan;        /* virus scan on/off */
        boolean_t       z_use_fuids;    /* version allows fuids */
+       boolean_t       z_replay;       /* set during ZIL replay */
        kmutex_t        z_online_recv_lock; /* recv in prog grabs as WRITER */
        uint64_t        z_version;      /* ZPL version */
 #define        ZFS_OBJ_MTX_SZ  64
index 4d02d14..b69323c 100644 (file)
@@ -335,7 +335,6 @@ typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
 typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
     uint64_t txg);
 typedef int zil_replay_func_t();
-typedef void zil_replay_cleaner_t();
 typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio);
 
 extern uint64_t        zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
@@ -350,9 +349,8 @@ extern void zil_free(zilog_t *zilog);
 extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data);
 extern void    zil_close(zilog_t *zilog);
 
-extern void    zil_replay(objset_t *os, void *arg, uint64_t *txgp,
-    zil_replay_func_t *replay_func[TX_MAX_TYPE],
-    zil_replay_cleaner_t *replay_cleaner);
+extern void    zil_replay(objset_t *os, void *arg,
+    zil_replay_func_t *replay_func[TX_MAX_TYPE]);
 extern void    zil_destroy(zilog_t *zilog, boolean_t keep_first);
 extern void    zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);
 
index 0fc800b..3f25829 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef        _SYS_ZIL_IMPL_H
 #define        _SYS_ZIL_IMPL_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/zil.h>
 #include <sys/dmu_objset.h>
 
@@ -74,13 +72,14 @@ struct zilog {
        uint64_t        zl_commit_seq;  /* committed up to this number */
        uint64_t        zl_lr_seq;      /* log record sequence number */
        uint64_t        zl_destroy_txg; /* txg of last zil_destroy() */
-       uint64_t        zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */
+       uint64_t        zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */
+       uint64_t        zl_replaying_seq; /* current replay seq number */
        uint32_t        zl_suspend;     /* log suspend count */
        kcondvar_t      zl_cv_writer;   /* log writer thread completion */
        kcondvar_t      zl_cv_suspend;  /* log suspend completion */
        uint8_t         zl_suspending;  /* log is currently suspending */
        uint8_t         zl_keep_first;  /* keep first log block in destroy */
-       uint8_t         zl_stop_replay; /* don't replay any further */
+       uint8_t         zl_replay;      /* replaying records while set */
        uint8_t         zl_stop_sync;   /* for debugging */
        uint8_t         zl_writer;      /* boolean: write setup in progress */
        uint8_t         zl_log_error;   /* boolean: log write error */
index 4de78df..21b0fbc 100644 (file)
@@ -132,12 +132,14 @@ enum zio_compress {
 #define        ZIO_FLAG_IO_RETRY               0x00400
 #define        ZIO_FLAG_IO_REWRITE             0x00800
 
-#define        ZIO_FLAG_PROBE                  0x01000
+#define        ZIO_FLAG_SELF_HEAL              0x01000
 #define        ZIO_FLAG_RESILVER               0x02000
 #define        ZIO_FLAG_SCRUB                  0x04000
 #define        ZIO_FLAG_SCRUB_THREAD           0x08000
 
-#define        ZIO_FLAG_GANG_CHILD             0x10000
+#define        ZIO_FLAG_PROBE                  0x10000
+#define        ZIO_FLAG_GANG_CHILD             0x20000
+#define        ZIO_FLAG_RAW                    0x40000
 
 #define        ZIO_FLAG_GANG_INHERIT           \
        (ZIO_FLAG_CANFAIL |             \
@@ -146,6 +148,7 @@ enum zio_compress {
        ZIO_FLAG_DONT_RETRY |           \
        ZIO_FLAG_DONT_CACHE |           \
        ZIO_FLAG_DONT_AGGREGATE |       \
+       ZIO_FLAG_SELF_HEAL |            \
        ZIO_FLAG_RESILVER |             \
        ZIO_FLAG_SCRUB |                \
        ZIO_FLAG_SCRUB_THREAD)
@@ -156,6 +159,14 @@ enum zio_compress {
        ZIO_FLAG_IO_RETRY |             \
        ZIO_FLAG_PROBE)
 
+#define        ZIO_FLAG_AGG_INHERIT            \
+       (ZIO_FLAG_DONT_AGGREGATE |      \
+       ZIO_FLAG_IO_REPAIR |            \
+       ZIO_FLAG_SELF_HEAL |            \
+       ZIO_FLAG_RESILVER |             \
+       ZIO_FLAG_SCRUB |                \
+       ZIO_FLAG_SCRUB_THREAD)
+
 #define        ZIO_PIPELINE_CONTINUE           0x100
 #define        ZIO_PIPELINE_STOP               0x101
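
ZIO_FLAG_AGG_INHERIT is the aggregation analogue of ZIO_FLAG_GANG_INHERIT above: it enumerates the flags a coalesced I/O must carry forward so that repair, scrub, and resilver accounting survives aggregation. A sketch, assuming a parent zio pio with the usual io_flags field:

	/* Sketch only: flags an aggregated child would inherit from pio. */
	int aggflags = pio->io_flags & ZIO_FLAG_AGG_INHERIT;
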
 
index 87727fa..4128329 100644 (file)
@@ -720,6 +720,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
        vdev_t *vd;
        int dshift = 3;
        int all_zero;
+       int zio_lock = B_FALSE;
+       boolean_t allocatable;
        uint64_t offset = -1ULL;
        uint64_t asize;
        uint64_t distance;
@@ -778,11 +780,20 @@ top:
        all_zero = B_TRUE;
        do {
                vd = mg->mg_vd;
+
                /*
                 * Don't allocate from faulted devices.
                 */
-               if (!vdev_allocatable(vd))
+               if (zio_lock) {
+                       spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
+                       allocatable = vdev_allocatable(vd);
+                       spa_config_exit(spa, SCL_ZIO, FTAG);
+               } else {
+                       allocatable = vdev_allocatable(vd);
+               }
+               if (!allocatable)
                        goto next;
+
                /*
                 * Avoid writing single-copy data to a failing vdev
                 */
@@ -858,6 +869,12 @@ next:
                goto top;
        }
 
+       if (!zio_lock) {
+               dshift = 3;
+               zio_lock = B_TRUE;
+               goto top;
+       }
+
        bzero(&dva[d], sizeof (dva_t));
 
        return (ENOSPC);
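
The zio_lock fallback gives allocation one final, slower pass: after the normal lock-free scan fails everywhere, dshift is reset and vdev_allocatable() is re-evaluated under SCL_ZIO, so a racing vdev reopen cannot hide an otherwise usable device. The control flow reduces to the two-pass shape sketched below (all names are local to the example, not ZFS interfaces):

	static int tries;

	static int
	try_alloc(int locked)
	{
		/* stand-in for the metaslab group scan */
		return (++tries == 2 && locked ? 0 : -1);
	}

	static int
	alloc_with_retry(void)
	{
		int locked = 0;
	retry:
		if (try_alloc(locked) == 0)
			return (0);
		if (!locked) {
			locked = 1;	/* last chance: recheck under the lock */
			goto retry;
		}
		return (-1);		/* ENOSPC analogue */
	}
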
@@ -946,7 +963,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 
        space_map_claim(&msp->ms_map, offset, size);
 
-       if (spa_mode & FWRITE) {        /* don't dirty if we're zdb(1M) */
+       if (spa_writeable(spa)) {       /* don't dirty if we're zdb(1M) */
                if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
                        vdev_dirty(vd, VDD_METASLAB, msp, txg);
                space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
index fb1b96f..ef04b7c 100644 (file)
@@ -486,11 +486,12 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
  * Activate an uninitialized pool.
  */
 static void
-spa_activate(spa_t *spa)
+spa_activate(spa_t *spa, int mode)
 {
        ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
 
        spa->spa_state = POOL_STATE_ACTIVE;
+       spa->spa_mode = mode;
 
        spa->spa_normal_class = metaslab_class_create();
        spa->spa_log_class = metaslab_class_create();
@@ -640,11 +641,6 @@ spa_unload(spa_t *spa)
        mutex_exit(&spa->spa_async_root_lock);
 
        /*
-        * Drop and purge level 2 cache
-        */
-       spa_l2cache_drop(spa);
-
-       /*
         * Close the dsl pool.
         */
        if (spa->spa_dsl_pool) {
@@ -652,6 +648,13 @@ spa_unload(spa_t *spa)
                spa->spa_dsl_pool = NULL;
        }
 
+       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+       /*
+        * Drop and purge level 2 cache
+        */
+       spa_l2cache_drop(spa);
+
        /*
         * Close all vdevs.
         */
@@ -686,6 +689,8 @@ spa_unload(spa_t *spa)
        spa->spa_l2cache.sav_count = 0;
 
        spa->spa_async_suspended = 0;
+
+       spa_config_exit(spa, SCL_ALL, FTAG);
 }
 
 /*
@@ -897,12 +902,9 @@ spa_load_l2cache(spa_t *spa)
 
                vd = oldvdevs[i];
                if (vd != NULL) {
-                       if ((spa_mode & FWRITE) &&
-                           spa_l2cache_exists(vd->vdev_guid, &pool) &&
-                           pool != 0ULL &&
-                           l2arc_vdev_present(vd)) {
+                       if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
+                           pool != 0ULL && l2arc_vdev_present(vd))
                                l2arc_remove_vdev(vd);
-                       }
                        (void) vdev_close(vd);
                        spa_l2cache_remove(vd);
                }
@@ -1018,8 +1020,16 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
        uint64_t pool_guid;
        uint64_t version;
        uint64_t autoreplace = 0;
+       int orig_mode = spa->spa_mode;
        char *ereport = FM_EREPORT_ZFS_POOL;
 
+       /*
+        * If this is an untrusted config, access the pool in read-only mode.
+        * This prevents things like resilvering recently removed devices.
+        */
+       if (!mosconfig)
+               spa->spa_mode = FREAD;
+
        ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
        spa->spa_load_state = state;
@@ -1077,12 +1087,13 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
         * Validate the labels for all leaf vdevs.  We need to grab the config
         * lock because all label I/O is done with ZIO_FLAG_CONFIG_WRITER.
         */
-       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-       error = vdev_validate(rvd);
-       spa_config_exit(spa, SCL_ALL, FTAG);
-
-       if (error != 0)
-               goto out;
+       if (mosconfig) {
+               spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+               error = vdev_validate(rvd);
+               spa_config_exit(spa, SCL_ALL, FTAG);
+               if (error != 0)
+                       goto out;
+       }
 
        if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
                error = ENXIO;
@@ -1184,7 +1195,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
                spa_config_set(spa, newconfig);
                spa_unload(spa);
                spa_deactivate(spa);
-               spa_activate(spa);
+               spa_activate(spa, orig_mode);
 
                return (spa_load(spa, newconfig, state, B_TRUE));
        }
@@ -1376,10 +1387,11 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
                goto out;
        }
 
-       if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
+       if (spa_writeable(spa)) {
                dmu_tx_t *tx;
                int need_update = B_FALSE;
-               int c;
+
+               ASSERT(state != SPA_LOAD_TRYIMPORT);
 
                /*
                 * Claim log blocks that haven't been committed yet.
@@ -1407,7 +1419,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
                    state == SPA_LOAD_IMPORT)
                        need_update = B_TRUE;
 
-               for (c = 0; c < rvd->vdev_children; c++)
+               for (int c = 0; c < rvd->vdev_children; c++)
                        if (rvd->vdev_child[c]->vdev_ms_array == 0)
                                need_update = B_TRUE;
 
@@ -1417,6 +1429,12 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
                 */
                if (need_update)
                        spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+
+               /*
+                * Check all DTLs to see if anything needs resilvering.
+                */
+               if (vdev_resilver_needed(rvd, NULL, NULL))
+                       spa_async_request(spa, SPA_ASYNC_RESILVER);
        }
 
        error = 0;
@@ -1469,7 +1487,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
        }
        if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
 
-               spa_activate(spa);
+               spa_activate(spa, spa_mode_global);
 
                error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
 
@@ -1873,11 +1891,9 @@ spa_l2cache_drop(spa_t *spa)
                vd = sav->sav_vdevs[i];
                ASSERT(vd != NULL);
 
-               if ((spa_mode & FWRITE) &&
-                   spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL &&
-                   l2arc_vdev_present(vd)) {
+               if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
+                   pool != 0ULL && l2arc_vdev_present(vd))
                        l2arc_remove_vdev(vd);
-               }
                if (vd->vdev_isl2cache)
                        spa_l2cache_remove(vd);
                vdev_clear_stats(vd);
@@ -1918,7 +1934,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
        (void) nvlist_lookup_string(props,
            zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
        spa = spa_add(pool, altroot);
-       spa_activate(spa);
+       spa_activate(spa, spa_mode_global);
 
        spa->spa_uberblock.ub_txg = txg - 1;
 
@@ -2121,7 +2137,7 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props,
        (void) nvlist_lookup_string(props,
            zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
        spa = spa_add(pool, altroot);
-       spa_activate(spa);
+       spa_activate(spa, spa_mode_global);
 
        if (allowfaulted)
                spa->spa_import_faulted = B_TRUE;
@@ -2160,7 +2176,8 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props,
                    VDEV_ALLOC_L2CACHE);
        spa_config_exit(spa, SCL_ALL, FTAG);
 
-       if (error != 0 || (props && (error = spa_prop_set(spa, props)))) {
+       if (error != 0 || (props && spa_writeable(spa) &&
+           (error = spa_prop_set(spa, props)))) {
                if (loaderr != 0 && loaderr != EINVAL && allowfaulted) {
                        /*
                         * If we failed to load the pool, but 'allowfaulted' is
@@ -2219,7 +2236,7 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props,
                spa->spa_l2cache.sav_sync = B_TRUE;
        }
 
-       if (spa_mode & FWRITE) {
+       if (spa_writeable(spa)) {
                /*
                 * Update the config cache to include the newly-imported pool.
                 */
@@ -2367,11 +2384,11 @@ spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf)
                char *cdevid, *cpath;
                uint64_t tmptxg;
 
+               cpath = NULL;
+               cdevid = NULL;
                if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH,
-                   &cpath) != 0)
-                       return (EINVAL);
-               if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_DEVID,
-                   &cdevid) != 0)
+                   &cpath) != 0 && nvlist_lookup_string(child[c],
+                   ZPOOL_CONFIG_DEVID, &cdevid) != 0)
                        return (EINVAL);
                if ((spa_check_rootconf(cpath, cdevid, NULL,
                    &tmptxg) == 0) && (tmptxg > txg)) {
@@ -2489,7 +2506,7 @@ spa_tryimport(nvlist_t *tryconfig)
         */
        mutex_enter(&spa_namespace_lock);
        spa = spa_add(TRYIMPORT_NAME, NULL);
-       spa_activate(spa);
+       spa_activate(spa, FREAD);
 
        /*
         * Pass off the heavy lifting to spa_load().
@@ -2563,18 +2580,19 @@ spa_tryimport(nvlist_t *tryconfig)
  * The act of destroying or exporting a pool is very simple.  We make sure there
  * is no more pending I/O and any references to the pool are gone.  Then, we
  * update the pool state and sync all the labels to disk, removing the
- * configuration from the cache afterwards.
+ * configuration from the cache afterwards. If the 'hardforce' flag is set, then
+ * we don't sync the labels or remove the configuration cache.
  */
 static int
 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
-    boolean_t force)
+    boolean_t force, boolean_t hardforce)
 {
        spa_t *spa;
 
        if (oldconfig)
                *oldconfig = NULL;
 
-       if (!(spa_mode & FWRITE))
+       if (!(spa_mode_global & FWRITE))
                return (EROFS);
 
        mutex_enter(&spa_namespace_lock);
@@ -2635,7 +2653,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
                 * so mark them all dirty.  spa_unload() will do the
                 * final sync that pushes these changes out.
                 */
-               if (new_state != POOL_STATE_UNINITIALIZED) {
+               if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
                        spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
                        spa->spa_state = new_state;
                        spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
@@ -2655,7 +2673,8 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
                VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
 
        if (new_state != POOL_STATE_UNINITIALIZED) {
-               spa_config_sync(spa, B_TRUE, B_TRUE);
+               if (!hardforce)
+                       spa_config_sync(spa, B_TRUE, B_TRUE);
                spa_remove(spa);
        }
        mutex_exit(&spa_namespace_lock);
@@ -2669,16 +2688,19 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
 int
 spa_destroy(char *pool)
 {
-       return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE));
+       return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
+           B_FALSE, B_FALSE));
 }
 
 /*
  * Export a storage pool.
  */
 int
-spa_export(char *pool, nvlist_t **oldconfig, boolean_t force)
+spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
+    boolean_t hardforce)
 {
-       return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force));
+       return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
+           force, hardforce));
 }
 
 /*
@@ -2689,7 +2711,7 @@ int
 spa_reset(char *pool)
 {
        return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
-           B_FALSE));
+           B_FALSE, B_FALSE));
 }
 
 /*
@@ -2705,7 +2727,7 @@ int
 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 {
        uint64_t txg;
-       int c, error;
+       int error;
        vdev_t *rvd = spa->spa_root_vdev;
        vdev_t *vd, *tvd;
        nvlist_t **spares, **l2cache;
@@ -2744,7 +2766,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
        /*
         * Transfer each new top-level vdev from vd to rvd.
         */
-       for (c = 0; c < vd->vdev_children; c++) {
+       for (int c = 0; c < vd->vdev_children; c++) {
                tvd = vd->vdev_child[c];
                vdev_remove_child(vd, tvd);
                tvd->vdev_id = rvd->vdev_children;
@@ -2952,10 +2974,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
         */
        open_txg = txg + TXG_CONCURRENT_STATES - 1;
 
-       mutex_enter(&newvd->vdev_dtl_lock);
-       space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
-           open_txg - TXG_INITIAL + 1);
-       mutex_exit(&newvd->vdev_dtl_lock);
+       vdev_dtl_dirty(newvd, DTL_MISSING,
+           TXG_INITIAL, open_txg - TXG_INITIAL + 1);
 
        if (newvd->vdev_isspare)
                spa_spare_activate(newvd);
@@ -2999,10 +3019,10 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
  * is a replacing vdev.
  */
 int
-spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
+spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
 {
        uint64_t txg;
-       int c, t, error;
+       int error;
        vdev_t *rvd = spa->spa_root_vdev;
        vdev_t *vd, *pvd, *cvd, *tvd;
        boolean_t unspare = B_FALSE;
@@ -3022,6 +3042,22 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
        pvd = vd->vdev_parent;
 
        /*
+        * If the parent/child relationship is not as expected, don't do it.
+        * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
+        * vdev that's replacing B with C.  The user's intent in replacing
+        * is to go from M(A,B) to M(A,C).  If the user decides to cancel
+        * the replace by detaching C, the expected behavior is to end up
+        * M(A,B).  But suppose that right after deciding to detach C,
+        * the replacement of B completes.  We would have M(A,C), and then
+        * ask to detach C, which would leave us with just A -- not what
+        * the user wanted.  To prevent this, we make sure that the
+        * parent/child relationship hasn't changed -- in this example,
+        * that C's parent is still the replacing vdev R.
+        */
+       if (pvd->vdev_guid != pguid && pguid != 0)
+               return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+       /*
         * If replace_done is specified, only remove this device if it's
         * the first child of a replacing vdev.  For the 'spare' vdev, either
         * disk can be removed.
@@ -3047,36 +3083,13 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
                return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
        /*
-        * If there's only one replica, you can't detach it.
+        * If this device has the only valid copy of some data,
+        * we cannot safely detach it.
         */
-       if (pvd->vdev_children <= 1)
+       if (vdev_dtl_required(vd))
                return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
-       /*
-        * If all siblings have non-empty DTLs, this device may have the only
-        * valid copy of the data, which means we cannot safely detach it.
-        *
-        * XXX -- as in the vdev_offline() case, we really want a more
-        * precise DTL check.
-        */
-       for (c = 0; c < pvd->vdev_children; c++) {
-               uint64_t dirty;
-
-               cvd = pvd->vdev_child[c];
-               if (cvd == vd)
-                       continue;
-               if (vdev_is_dead(cvd))
-                       continue;
-               mutex_enter(&cvd->vdev_dtl_lock);
-               dirty = cvd->vdev_dtl_map.sm_space |
-                   cvd->vdev_dtl_scrub.sm_space;
-               mutex_exit(&cvd->vdev_dtl_lock);
-               if (!dirty)
-                       break;
-       }
-
-       if (c == pvd->vdev_children)
-               return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+       ASSERT(pvd->vdev_children >= 2);
 
        /*
         * If we are detaching the second disk from a replacing vdev, then
@@ -3102,7 +3115,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
         * active spare list for the pool.
         */
        if (pvd->vdev_ops == &vdev_spare_ops &&
-           vd->vdev_id == 0)
+           vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare)
                unspare = B_TRUE;
 
        /*
@@ -3128,14 +3141,18 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
 
        /*
         * If we need to remove the remaining child from the list of hot spares,
-        * do it now, marking the vdev as no longer a spare in the process.  We
-        * must do this before vdev_remove_parent(), because that can change the
-        * GUID if it creates a new toplevel GUID.
+        * do it now, marking the vdev as no longer a spare in the process.
+        * We must do this before vdev_remove_parent(), because that can
+        * change the GUID if it creates a new toplevel GUID.  For a similar
+        * reason, we must remove the spare now, in the same txg as the detach;
+        * otherwise someone could attach a new sibling, change the GUID, and
+        * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
         */
        if (unspare) {
                ASSERT(cvd->vdev_isspare);
                spa_spare_remove(cvd);
                unspare_guid = cvd->vdev_guid;
+               (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
        }
 
        /*
@@ -3173,7 +3190,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
         * But first make sure we're not on any *other* txg's DTL list, to
         * prevent vd from being accessed after it's freed.
         */
-       for (t = 0; t < TXG_SIZE; t++)
+       for (int t = 0; t < TXG_SIZE; t++)
                (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
        vd->vdev_detached = B_TRUE;
        vdev_dirty(tvd, VDD_DTL, vd, txg);
@@ -3188,11 +3205,14 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
         * list of every other pool.
         */
        if (unspare) {
+               spa_t *myspa = spa;
                spa = NULL;
                mutex_enter(&spa_namespace_lock);
                while ((spa = spa_next(spa)) != NULL) {
                        if (spa->spa_state != POOL_STATE_ACTIVE)
                                continue;
+                       if (spa == myspa)
+                               continue;
                        spa_open_ref(spa, FTAG);
                        mutex_exit(&spa_namespace_lock);
                        (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
@@ -3256,10 +3276,12 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
        vdev_t *vd;
        nvlist_t **spares, **l2cache, *nv;
        uint_t nspares, nl2cache;
-       uint64_t txg;
+       uint64_t txg = 0;
        int error = 0;
+       boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
 
-       txg = spa_vdev_enter(spa);
+       if (!locked)
+               txg = spa_vdev_enter(spa);
 
        vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
@@ -3302,7 +3324,10 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
                error = ENOENT;
        }
 
-       return (spa_vdev_exit(spa, NULL, txg, error));
+       if (!locked)
+               return (spa_vdev_exit(spa, NULL, txg, error));
+
+       return (error);
 }
 
 /*
@@ -3328,13 +3353,9 @@ spa_vdev_resilver_done_hunt(vdev_t *vd)
                oldvd = vd->vdev_child[0];
                newvd = vd->vdev_child[1];
 
-               mutex_enter(&newvd->vdev_dtl_lock);
-               if (newvd->vdev_dtl_map.sm_space == 0 &&
-                   newvd->vdev_dtl_scrub.sm_space == 0) {
-                       mutex_exit(&newvd->vdev_dtl_lock);
+               if (vdev_dtl_empty(newvd, DTL_MISSING) &&
+                   !vdev_dtl_required(oldvd))
                        return (oldvd);
-               }
-               mutex_exit(&newvd->vdev_dtl_lock);
        }
 
        /*
@@ -3344,15 +3365,12 @@ spa_vdev_resilver_done_hunt(vdev_t *vd)
                newvd = vd->vdev_child[0];
                oldvd = vd->vdev_child[1];
 
-               mutex_enter(&newvd->vdev_dtl_lock);
                if (newvd->vdev_unspare &&
-                   newvd->vdev_dtl_map.sm_space == 0 &&
-                   newvd->vdev_dtl_scrub.sm_space == 0) {
+                   vdev_dtl_empty(newvd, DTL_MISSING) &&
+                   !vdev_dtl_required(oldvd)) {
                        newvd->vdev_unspare = 0;
-                       mutex_exit(&newvd->vdev_dtl_lock);
                        return (oldvd);
                }
-               mutex_exit(&newvd->vdev_dtl_lock);
        }
 
        return (NULL);
@@ -3361,36 +3379,37 @@ spa_vdev_resilver_done_hunt(vdev_t *vd)
 static void
 spa_vdev_resilver_done(spa_t *spa)
 {
-       vdev_t *vd;
-       vdev_t *pvd;
-       uint64_t guid;
-       uint64_t pguid = 0;
+       vdev_t *vd, *pvd, *ppvd;
+       uint64_t guid, sguid, pguid, ppguid;
 
-       spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
        while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
+               pvd = vd->vdev_parent;
+               ppvd = pvd->vdev_parent;
                guid = vd->vdev_guid;
+               pguid = pvd->vdev_guid;
+               ppguid = ppvd->vdev_guid;
+               sguid = 0;
                /*
                 * If we have just finished replacing a hot spared device, then
                 * we need to detach the parent's first child (the original hot
                 * spare) as well.
                 */
-               pvd = vd->vdev_parent;
-               if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
-                   pvd->vdev_id == 0) {
+               if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) {
                        ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
-                       ASSERT(pvd->vdev_parent->vdev_children == 2);
-                       pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
+                       ASSERT(ppvd->vdev_children == 2);
+                       sguid = ppvd->vdev_child[1]->vdev_guid;
                }
-               spa_config_exit(spa, SCL_CONFIG, FTAG);
-               if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
+               spa_config_exit(spa, SCL_ALL, FTAG);
+               if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
                        return;
-               if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
+               if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
                        return;
-               spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+               spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
        }
 
-       spa_config_exit(spa, SCL_CONFIG, FTAG);
+       spa_config_exit(spa, SCL_ALL, FTAG);
 }
 
 /*
@@ -3925,9 +3944,22 @@ spa_sync(spa_t *spa, uint64_t txg)
         * into config changes that go out with this transaction group.
         */
        spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
-       while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
-               vdev_state_clean(vd);
-               vdev_config_dirty(vd);
+       while (list_head(&spa->spa_state_dirty_list) != NULL) {
+               /*
+                * We need the write lock here because, for aux vdevs,
+                * calling vdev_config_dirty() modifies sav_config.
+                * This is ugly and will become unnecessary when we
+                * eliminate the aux vdev wart by integrating all vdevs
+                * into the root vdev tree.
+                */
+               spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+               spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
+               while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
+                       vdev_state_clean(vd);
+                       vdev_config_dirty(vd);
+               }
+               spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+               spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
        }
        spa_config_exit(spa, SCL_STATE, FTAG);
 
index ee425a9..252869d 100644 (file)
@@ -208,6 +208,9 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent)
 
        ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
+       if (rootdir == NULL)
+               return;
+
        /*
         * Iterate over all cachefiles for the pool, past or present.  When the
         * cachefile is changed, the new one is pushed onto this list, allowing
index 36046e6..485e83f 100644 (file)
@@ -230,7 +230,7 @@ static kmutex_t spa_l2cache_lock;
 static avl_tree_t spa_l2cache_avl;
 
 kmem_cache_t *spa_buffer_pool;
-int spa_mode;
+int spa_mode_global;
 
 #ifdef ZFS_DEBUG
 /* Everything except dprintf is on by default in debug builds */
@@ -880,8 +880,10 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
                txg_wait_synced(spa->spa_dsl_pool, txg);
 
        if (vd != NULL) {
-               ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0);
+               ASSERT(!vd->vdev_detached || vd->vdev_dtl_smo.smo_object == 0);
+               spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
                vdev_free(vd);
+               spa_config_exit(spa, SCL_ALL, spa);
        }
 
        /*
@@ -912,6 +914,15 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
 
        spa_config_exit(spa, SCL_STATE_ALL, spa);
 
+       /*
+        * If anything changed, wait for it to sync.  This ensures that,
+        * from the system administrator's perspective, zpool(1M) commands
+        * are synchronous.  This is important for things like zpool offline:
+        * when the command completes, you expect no further I/O from ZFS.
+        */
+       if (vd != NULL)
+               txg_wait_synced(spa->spa_dsl_pool, 0);
+
        return (error);
 }
 
@@ -1351,7 +1362,7 @@ spa_init(int mode)
        avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
            offsetof(spa_aux_t, aux_avl));
 
-       spa_mode = mode;
+       spa_mode_global = mode;
 
        refcount_init();
        unique_init();
@@ -1408,3 +1419,15 @@ spa_is_root(spa_t *spa)
 {
        return (spa->spa_is_root);
 }
+
+boolean_t
+spa_writeable(spa_t *spa)
+{
+       return (!!(spa->spa_mode & FWRITE));
+}
+
+int
+spa_mode(spa_t *spa)
+{
+       return (spa->spa_mode);
+}
index 0a1fd59..1cdacc8 100644 (file)
@@ -23,8 +23,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
@@ -60,6 +58,8 @@ space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift,
 {
        bzero(sm, sizeof (*sm));
 
+       cv_init(&sm->sm_load_cv, NULL, CV_DEFAULT, NULL);
+
        avl_create(&sm->sm_root, space_map_seg_compare,
            sizeof (space_seg_t), offsetof(struct space_seg, ss_node));
 
@@ -75,6 +75,7 @@ space_map_destroy(space_map_t *sm)
        ASSERT(!sm->sm_loaded && !sm->sm_loading);
        VERIFY3U(sm->sm_space, ==, 0);
        avl_destroy(&sm->sm_root);
+       cv_destroy(&sm->sm_load_cv);
 }
 
 void
@@ -180,7 +181,7 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
        sm->sm_space -= size;
 }
 
-int
+boolean_t
 space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
 {
        avl_index_t where;
@@ -220,59 +221,10 @@ space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
 {
        space_seg_t *ss;
 
-       for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
-               func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
-}
-
-void
-space_map_excise(space_map_t *sm, uint64_t start, uint64_t size)
-{
-       avl_tree_t *t = &sm->sm_root;
-       avl_index_t where;
-       space_seg_t *ss, search;
-       uint64_t end = start + size;
-       uint64_t rm_start, rm_end;
-
        ASSERT(MUTEX_HELD(sm->sm_lock));
 
-       search.ss_start = start;
-       search.ss_end = start;
-
-       for (;;) {
-               ss = avl_find(t, &search, &where);
-
-               if (ss == NULL)
-                       ss = avl_nearest(t, where, AVL_AFTER);
-
-               if (ss == NULL || ss->ss_start >= end)
-                       break;
-
-               rm_start = MAX(ss->ss_start, start);
-               rm_end = MIN(ss->ss_end, end);
-
-               space_map_remove(sm, rm_start, rm_end - rm_start);
-       }
-}
-
-/*
- * Replace smd with the union of smd and sms.
- */
-void
-space_map_union(space_map_t *smd, space_map_t *sms)
-{
-       avl_tree_t *t = &sms->sm_root;
-       space_seg_t *ss;
-
-       ASSERT(MUTEX_HELD(smd->sm_lock));
-
-       /*
-        * For each source segment, remove any intersections with the
-        * destination, then add the source segment to the destination.
-        */
-       for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) {
-               space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start);
-               space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start);
-       }
+       for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
+               func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
 }
 
 /*
@@ -504,3 +456,131 @@ space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
        smo->smo_objsize = 0;
        smo->smo_alloc = 0;
 }
+
+/*
+ * Space map reference trees.
+ *
+ * A space map is a collection of integers.  Every integer is either
+ * in the map, or it's not.  A space map reference tree generalizes
+ * the idea: it allows its members to have arbitrary reference counts,
+ * as opposed to the implicit reference count of 0 or 1 in a space map.
+ * This representation comes in handy when computing the union or
+ * intersection of multiple space maps.  For example, the union of
+ * N space maps is the subset of the reference tree with refcnt >= 1.
+ * The intersection of N space maps is the subset with refcnt >= N.
+ *
+ * [It's very much like a Fourier transform.  Unions and intersections
+ * are hard to perform in the 'space map domain', so we convert the maps
+ * into the 'reference count domain', where it's trivial, then invert.]
+ *
+ * vdev_dtl_reassess() uses computations of this form to determine
+ * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev
+ * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev
+ * has an outage wherever refcnt >= vdev_children.
+ */
+static int
+space_map_ref_compare(const void *x1, const void *x2)
+{
+       const space_ref_t *sr1 = x1;
+       const space_ref_t *sr2 = x2;
+
+       if (sr1->sr_offset < sr2->sr_offset)
+               return (-1);
+       if (sr1->sr_offset > sr2->sr_offset)
+               return (1);
+
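+       /* tie-break equal offsets by node address; the tree may hold both */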
+       if (sr1 < sr2)
+               return (-1);
+       if (sr1 > sr2)
+               return (1);
+
+       return (0);
+}
+
+void
+space_map_ref_create(avl_tree_t *t)
+{
+       avl_create(t, space_map_ref_compare,
+           sizeof (space_ref_t), offsetof(space_ref_t, sr_node));
+}
+
+void
+space_map_ref_destroy(avl_tree_t *t)
+{
+       space_ref_t *sr;
+       void *cookie = NULL;
+
+       while ((sr = avl_destroy_nodes(t, &cookie)) != NULL)
+               kmem_free(sr, sizeof (*sr));
+
+       avl_destroy(t);
+}
+
+static void
+space_map_ref_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt)
+{
+       space_ref_t *sr;
+
+       sr = kmem_alloc(sizeof (*sr), KM_SLEEP);
+       sr->sr_offset = offset;
+       sr->sr_refcnt = refcnt;
+
+       avl_add(t, sr);
+}
+
+void
+space_map_ref_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
+       int64_t refcnt)
+{
+       space_map_ref_add_node(t, start, refcnt);
+       space_map_ref_add_node(t, end, -refcnt);
+}
+
+/*
+ * Convert (or add) a space map into a reference tree.
+ */
+void
+space_map_ref_add_map(avl_tree_t *t, space_map_t *sm, int64_t refcnt)
+{
+       space_seg_t *ss;
+
+       ASSERT(MUTEX_HELD(sm->sm_lock));
+
+       for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
+               space_map_ref_add_seg(t, ss->ss_start, ss->ss_end, refcnt);
+}
+
+/*
+ * Convert a reference tree into a space map.  The space map will contain
+ * all members of the reference tree for which refcnt >= minref.
+ */
+void
+space_map_ref_generate_map(avl_tree_t *t, space_map_t *sm, int64_t minref)
+{
+       uint64_t start = -1ULL;
+       int64_t refcnt = 0;
+       space_ref_t *sr;
+
+       ASSERT(MUTEX_HELD(sm->sm_lock));
+
+       space_map_vacate(sm, NULL, NULL);
+
+       for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) {
+               refcnt += sr->sr_refcnt;
+               if (refcnt >= minref) {
+                       if (start == -1ULL) {
+                               start = sr->sr_offset;
+                       }
+               } else {
+                       if (start != -1ULL) {
+                               uint64_t end = sr->sr_offset;
+                               ASSERT(start <= end);
+                               if (end > start)
+                                       space_map_add(sm, start, end - start);
+                               start = -1ULL;
+                       }
+               }
+       }
+       ASSERT(refcnt == 0);
+       ASSERT(start == -1ULL);
+}
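
To make the reference-count arithmetic concrete, here is a minimal userland sketch of the same boundary-sweep technique, using a flat array and qsort(3C) in place of the kernel's AVL tree and space-map types; every name in it is illustrative rather than part of this change:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* One boundary of a segment: +refcnt at its start, -refcnt at its end. */
typedef struct ref {
	uint64_t	off;
	int64_t		refcnt;
} ref_t;

static int
ref_compare(const void *a, const void *b)
{
	const ref_t *r1 = a;
	const ref_t *r2 = b;

	return (r1->off < r2->off ? -1 : r1->off > r2->off ? 1 : 0);
}

/* Print every range whose accumulated refcnt is >= minref. */
static void
ref_generate(ref_t *refs, int nrefs, int64_t minref)
{
	uint64_t start = UINT64_MAX;
	int64_t refcnt = 0;

	qsort(refs, nrefs, sizeof (ref_t), ref_compare);

	for (int i = 0; i < nrefs; i++) {
		refcnt += refs[i].refcnt;
		if (refcnt >= minref && start == UINT64_MAX) {
			start = refs[i].off;
		} else if (refcnt < minref && start != UINT64_MAX) {
			(void) printf("[%llu, %llu)\n",
			    (unsigned long long)start,
			    (unsigned long long)refs[i].off);
			start = UINT64_MAX;
		}
	}
}

int
main(void)
{
	/* Map A = [10, 30); map B = [20, 40). */
	ref_t refs[] = {
		{ 10, 1 }, { 30, -1 },	/* A */
		{ 20, 1 }, { 40, -1 },	/* B */
	};

	(void) printf("union (minref = 1):\n");
	ref_generate(refs, 4, 1);	/* prints [10, 40) */
	(void) printf("intersection (minref = 2):\n");
	ref_generate(refs, 4, 2);	/* prints [20, 30) */
	return (0);
}

Running it prints [10, 40) for the union and [20, 30) for the intersection -- the refcnt >= 1 and refcnt >= N subsets described in the comment above. The sweep mirrors space_map_ref_generate_map(): refcnt crossing minref opens a segment, dropping back below it closes one.
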
index 2bbf2f0..e3c0e2a 100644 (file)
@@ -63,6 +63,12 @@ txg_init(dsl_pool_t *dp, uint64_t txg)
        rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL);
        mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);
 
+       cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
+       cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL);
+       cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL);
+       cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL);
+       cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL);
+
        tx->tx_open_txg = txg;
 }
 
@@ -80,6 +86,12 @@ txg_fini(dsl_pool_t *dp)
        rw_destroy(&tx->tx_suspend);
        mutex_destroy(&tx->tx_sync_lock);
 
+       cv_destroy(&tx->tx_sync_more_cv);
+       cv_destroy(&tx->tx_sync_done_cv);
+       cv_destroy(&tx->tx_quiesce_more_cv);
+       cv_destroy(&tx->tx_quiesce_done_cv);
+       cv_destroy(&tx->tx_exit_cv);
+
        for (c = 0; c < max_ncpus; c++) {
                int i;
 
index 16a27e5..d9689e8 100644 (file)
@@ -316,8 +316,10 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
        mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
-       space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
-       space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
+       for (int t = 0; t < DTL_TYPES; t++) {
+               space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0,
+                   &vd->vdev_dtl_lock);
+       }
        txg_list_create(&vd->vdev_ms_list,
            offsetof(struct metaslab, ms_txg_node));
        txg_list_create(&vd->vdev_dtl_list,
@@ -474,7 +476,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
            (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) {
                if (alloctype == VDEV_ALLOC_LOAD) {
                        (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
-                           &vd->vdev_dtl.smo_object);
+                           &vd->vdev_dtl_smo.smo_object);
                        (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
                            &vd->vdev_unspare);
                }
@@ -566,12 +568,14 @@ vdev_free(vdev_t *vd)
 
        txg_list_destroy(&vd->vdev_ms_list);
        txg_list_destroy(&vd->vdev_dtl_list);
+
        mutex_enter(&vd->vdev_dtl_lock);
-       space_map_unload(&vd->vdev_dtl_map);
-       space_map_destroy(&vd->vdev_dtl_map);
-       space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
-       space_map_destroy(&vd->vdev_dtl_scrub);
+       for (int t = 0; t < DTL_TYPES; t++) {
+               space_map_unload(&vd->vdev_dtl[t]);
+               space_map_destroy(&vd->vdev_dtl[t]);
+       }
        mutex_exit(&vd->vdev_dtl_lock);
+
        mutex_destroy(&vd->vdev_dtl_lock);
        mutex_destroy(&vd->vdev_stat_lock);
        mutex_destroy(&vd->vdev_probe_lock);
@@ -709,14 +713,18 @@ vdev_remove_parent(vdev_t *cvd)
 
        vdev_remove_child(mvd, cvd);
        vdev_remove_child(pvd, mvd);
+
        /*
         * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
         * Otherwise, we could have detached an offline device, and when we
         * go to import the pool we'll think we have two top-level vdevs,
         * instead of a different version of the same top-level vdev.
         */
-       if (mvd->vdev_top == mvd)
-               cvd->vdev_guid = cvd->vdev_guid_sum = mvd->vdev_guid;
+       if (mvd->vdev_top == mvd) {
+               uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
+               cvd->vdev_guid += guid_delta;
+               cvd->vdev_guid_sum += guid_delta;
+       }
        cvd->vdev_id = mvd->vdev_id;
        vdev_add_child(pvd, cvd);
        vdev_top_update(cvd->vdev_top, cvd->vdev_top);
@@ -815,6 +823,7 @@ typedef struct vdev_probe_stats {
 static void
 vdev_probe_done(zio_t *zio)
 {
+       spa_t *spa = zio->io_spa;
        vdev_probe_stats_t *vps = zio->io_private;
        vdev_t *vd = vps->vps_vd;
 
@@ -822,7 +831,7 @@ vdev_probe_done(zio_t *zio)
                ASSERT(zio->io_vd == vd);
                if (zio->io_error == 0)
                        vps->vps_readable = 1;
-               if (zio->io_error == 0 && (spa_mode & FWRITE)) {
+               if (zio->io_error == 0 && spa_writeable(spa)) {
                        zio_nowait(zio_write_phys(vps->vps_root, vd,
                            zio->io_offset, zio->io_size, zio->io_data,
                            ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
@@ -843,12 +852,12 @@ vdev_probe_done(zio_t *zio)
                vd->vdev_cant_write |= !vps->vps_writeable;
 
                if (vdev_readable(vd) &&
-                   (vdev_writeable(vd) || !(spa_mode & FWRITE))) {
+                   (vdev_writeable(vd) || !spa_writeable(spa))) {
                        zio->io_error = 0;
                } else {
                        ASSERT(zio->io_error != 0);
                        zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
-                           zio->io_spa, vd, NULL, 0, 0);
+                           spa, vd, NULL, 0, 0);
                        zio->io_error = ENXIO;
                }
                kmem_free(vps, sizeof (*vps));
@@ -916,12 +925,15 @@ vdev_probe(vdev_t *vd, zio_t *pio)
 int
 vdev_open(vdev_t *vd)
 {
+       spa_t *spa = vd->vdev_spa;
        int error;
        int c;
        uint64_t osize = 0;
        uint64_t asize, psize;
        uint64_t ashift = 0;
 
+       ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
        ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
            vd->vdev_state == VDEV_STATE_CANT_OPEN ||
            vd->vdev_state == VDEV_STATE_OFFLINE);
@@ -1055,16 +1067,12 @@ vdev_open(vdev_t *vd)
 
        /*
         * If a leaf vdev has a DTL, and seems healthy, then kick off a
-        * resilver.  But don't do this if we are doing a reopen for a
-        * scrub, since this would just restart the scrub we are already
-        * doing.
+        * resilver.  But don't do this if we are doing a reopen for a scrub,
+        * since this would just restart the scrub we are already doing.
         */
-       if (vd->vdev_children == 0 && !vd->vdev_spa->spa_scrub_reopen) {
-               mutex_enter(&vd->vdev_dtl_lock);
-               if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd))
-                       spa_async_request(vd->vdev_spa, SPA_ASYNC_RESILVER);
-               mutex_exit(&vd->vdev_dtl_lock);
-       }
+       if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
+           vdev_resilver_needed(vd, NULL, NULL))
+               spa_async_request(spa, SPA_ASYNC_RESILVER);
 
        return (0);
 }
@@ -1165,6 +1173,10 @@ vdev_validate(vdev_t *vd)
 void
 vdev_close(vdev_t *vd)
 {
+       spa_t *spa = vd->vdev_spa;
+
+       ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
        vd->vdev_ops->vdev_op_close(vd);
 
        vdev_cache_purge(vd);
@@ -1283,34 +1295,88 @@ vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
        (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
 }
 
+/*
+ * DTLs.
+ *
+ * A vdev's DTL (dirty time log) is the set of transaction groups for which
+ * the vdev has less than perfect replication.  There are four kinds of DTL:
+ *
+ * DTL_MISSING: txgs for which the vdev has no valid copies of the data
+ *
+ * DTL_PARTIAL: txgs for which data is available, but not fully replicated
+ *
+ * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
+ *     scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
+ *     txgs that was scrubbed.
+ *
+ * DTL_OUTAGE: txgs which cannot currently be read, whether due to
+ *     persistent errors or just some device being offline.
+ *     Unlike the other three, the DTL_OUTAGE map is not generally
+ *     maintained; it's only computed when needed, typically to
+ *     determine whether a device can be detached.
+ *
+ * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
+ * either has the data or it doesn't.
+ *
+ * For interior vdevs such as mirror and RAID-Z the picture is more complex.
+ * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
+ * if any child is less than fully replicated, then so is its parent.
+ * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
+ * comprising only those txgs which appear in more than 'maxfaults' children;
+ * those are the txgs we don't have enough replication to read.  For example,
+ * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
+ * thus, its DTL_MISSING consists of the set of txgs that appear in more than
+ * two child DTL_MISSING maps.
+ *
+ * It should be clear from the above that to compute the DTLs and outage maps
+ * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
+ * Therefore, that is all we keep on disk.  When loading the pool, or after
+ * a configuration change, we generate all other DTLs from first principles.
+ */
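
The refcnt thresholds implied by this taxonomy reduce to one line per DTL type. Here is a sketch of the minref selection that vdev_dtl_reassess() performs further below, with dtl_minref() as a hypothetical standalone helper rather than pool code:

#include <assert.h>

typedef enum { DTL_MISSING, DTL_PARTIAL, DTL_SCRUB, DTL_OUTAGE } dtl_type_t;

/*
 * Minimum number of children whose DTL must contain a txg before that
 * txg enters an interior vdev's DTL of the given type.  (DTL_SCRUB is
 * leaf-only and is never recomputed this way.)
 */
static int
dtl_minref(dtl_type_t t, int nparity, int children)
{
	if (t == DTL_PARTIAL)
		return (1);		/* any degraded child degrades the parent */
	if (nparity != 0)
		return (nparity + 1);	/* RAID-Z: one more than the parity count */
	return (children);		/* mirror: every copy must be gone */
}

int
main(void)
{
	assert(dtl_minref(DTL_MISSING, 2, 6) == 3);	/* raidz2, 6 disks */
	assert(dtl_minref(DTL_MISSING, 0, 3) == 3);	/* 3-way mirror */
	assert(dtl_minref(DTL_PARTIAL, 2, 6) == 1);
	return (0);
}
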
 void
-vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
+vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
+       space_map_t *sm = &vd->vdev_dtl[t];
+
+       ASSERT(t < DTL_TYPES);
+       ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+
        mutex_enter(sm->sm_lock);
        if (!space_map_contains(sm, txg, size))
                space_map_add(sm, txg, size);
        mutex_exit(sm->sm_lock);
 }
 
-int
-vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
+boolean_t
+vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
-       int dirty;
+       space_map_t *sm = &vd->vdev_dtl[t];
+       boolean_t dirty = B_FALSE;
 
-       /*
-        * Quick test without the lock -- covers the common case that
-        * there are no dirty time segments.
-        */
-       if (sm->sm_space == 0)
-               return (0);
+       ASSERT(t < DTL_TYPES);
+       ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 
        mutex_enter(sm->sm_lock);
-       dirty = space_map_contains(sm, txg, size);
+       if (sm->sm_space != 0)
+               dirty = space_map_contains(sm, txg, size);
        mutex_exit(sm->sm_lock);
 
        return (dirty);
 }
 
+boolean_t
+vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
+{
+       space_map_t *sm = &vd->vdev_dtl[t];
+       boolean_t empty;
+
+       mutex_enter(sm->sm_lock);
+       empty = (sm->sm_space == 0);
+       mutex_exit(sm->sm_lock);
+
+       return (empty);
+}
+
 /*
  * Reassess DTLs after a config change or scrub completion.
  */
@@ -1318,11 +1384,19 @@ void
 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
 {
        spa_t *spa = vd->vdev_spa;
-       int c;
+       avl_tree_t reftree;
+       int minref;
 
-       ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+       ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
-       if (vd->vdev_children == 0) {
+       for (int c = 0; c < vd->vdev_children; c++)
+               vdev_dtl_reassess(vd->vdev_child[c], txg,
+                   scrub_txg, scrub_done);
+
+       if (vd == spa->spa_root_vdev)
+               return;
+
+       if (vd->vdev_ops->vdev_op_leaf) {
                mutex_enter(&vd->vdev_dtl_lock);
                if (scrub_txg != 0 &&
                    (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) {
@@ -1333,12 +1407,38 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
                         * will be valid, so excise the old region and
                         * fold in the scrub dtl.  Otherwise, leave the
                         * dtl as-is if there was an error.
+                        *
+                        * There's a little trick here: to excise the beginning
+                        * of the DTL_MISSING map, we put it into a reference
+                        * tree and then add a segment with refcnt -1 that
+                        * covers the range [0, scrub_txg).  This means
+                        * that each txg in that range has refcnt -1 or 0.
+                        * We then add DTL_SCRUB with a refcnt of 2, so that
+                        * entries in the range [0, scrub_txg) will have a
+                        * positive refcnt -- either 1 or 2.  We then convert
+                        * the reference tree into the new DTL_MISSING map.
                         */
-                       space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
-                       space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
+                       space_map_ref_create(&reftree);
+                       space_map_ref_add_map(&reftree,
+                           &vd->vdev_dtl[DTL_MISSING], 1);
+                       space_map_ref_add_seg(&reftree, 0, scrub_txg, -1);
+                       space_map_ref_add_map(&reftree,
+                           &vd->vdev_dtl[DTL_SCRUB], 2);
+                       space_map_ref_generate_map(&reftree,
+                           &vd->vdev_dtl[DTL_MISSING], 1);
+                       space_map_ref_destroy(&reftree);
                }
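
To see the refcount arithmetic with concrete (illustrative) numbers: suppose DTL_MISSING covers txgs [5, 21), the scrub covered [0, 15) (scrub_txg == 15), and DTL_SCRUB still holds the unrepairable txg [8, 9).  The tree then carries +1 over [5, 21), -1 over [0, 15), and +2 over [8, 9), so the per-txg refcnts are -1 on [0, 5), 0 on [5, 8), 2 on [8, 9), 0 on [9, 15), and 1 on [15, 21).  Generating with minref 1 produces a new DTL_MISSING of [8, 9) and [15, 21): the repaired prefix is excised, the txg the scrub could not fix survives, and everything at or beyond scrub_txg is untouched.
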
+               space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
+               space_map_walk(&vd->vdev_dtl[DTL_MISSING],
+                   space_map_add, &vd->vdev_dtl[DTL_PARTIAL]);
                if (scrub_done)
-                       space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+                       space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
+               space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
+               if (!vdev_readable(vd))
+                       space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
+               else
+                       space_map_walk(&vd->vdev_dtl[DTL_MISSING],
+                           space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
                mutex_exit(&vd->vdev_dtl_lock);
 
                if (txg != 0)
@@ -1346,35 +1446,34 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
                return;
        }
 
-       /*
-        * Make sure the DTLs are always correct under the scrub lock.
-        */
-       if (vd == spa->spa_root_vdev)
-               mutex_enter(&spa->spa_scrub_lock);
-
        mutex_enter(&vd->vdev_dtl_lock);
-       space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
-       space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
-       mutex_exit(&vd->vdev_dtl_lock);
-
-       for (c = 0; c < vd->vdev_children; c++) {
-               vdev_t *cvd = vd->vdev_child[c];
-               vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
-               mutex_enter(&vd->vdev_dtl_lock);
-               space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
-               space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
-               mutex_exit(&vd->vdev_dtl_lock);
+       for (int t = 0; t < DTL_TYPES; t++) {
+               if (t == DTL_SCRUB)
+                       continue;                       /* leaf vdevs only */
+               if (t == DTL_PARTIAL)
+                       minref = 1;                     /* i.e. non-zero */
+               else if (vd->vdev_nparity != 0)
+                       minref = vd->vdev_nparity + 1;  /* RAID-Z */
+               else
+                       minref = vd->vdev_children;     /* any kind of mirror */
+               space_map_ref_create(&reftree);
+               for (int c = 0; c < vd->vdev_children; c++) {
+                       vdev_t *cvd = vd->vdev_child[c];
+                       mutex_enter(&cvd->vdev_dtl_lock);
+                       space_map_ref_add_map(&reftree, &cvd->vdev_dtl[t], 1);
+                       mutex_exit(&cvd->vdev_dtl_lock);
+               }
+               space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
+               space_map_ref_destroy(&reftree);
        }
-
-       if (vd == spa->spa_root_vdev)
-               mutex_exit(&spa->spa_scrub_lock);
+       mutex_exit(&vd->vdev_dtl_lock);
 }
 
 static int
 vdev_dtl_load(vdev_t *vd)
 {
        spa_t *spa = vd->vdev_spa;
-       space_map_obj_t *smo = &vd->vdev_dtl;
+       space_map_obj_t *smo = &vd->vdev_dtl_smo;
        objset_t *mos = spa->spa_meta_objset;
        dmu_buf_t *db;
        int error;
@@ -1392,7 +1491,8 @@ vdev_dtl_load(vdev_t *vd)
        dmu_buf_rele(db, FTAG);
 
        mutex_enter(&vd->vdev_dtl_lock);
-       error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos);
+       error = space_map_load(&vd->vdev_dtl[DTL_MISSING],
+           NULL, SM_ALLOC, smo, mos);
        mutex_exit(&vd->vdev_dtl_lock);
 
        return (error);
@@ -1402,8 +1502,8 @@ void
 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
 {
        spa_t *spa = vd->vdev_spa;
-       space_map_obj_t *smo = &vd->vdev_dtl;
-       space_map_t *sm = &vd->vdev_dtl_map;
+       space_map_obj_t *smo = &vd->vdev_dtl_smo;
+       space_map_t *sm = &vd->vdev_dtl[DTL_MISSING];
        objset_t *mos = spa->spa_meta_objset;
        space_map_t smsync;
        kmutex_t smlock;
@@ -1461,6 +1561,37 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
 }
 
 /*
+ * Determine whether the specified vdev can be offlined/detached/removed
+ * without losing data.
+ */
+boolean_t
+vdev_dtl_required(vdev_t *vd)
+{
+       spa_t *spa = vd->vdev_spa;
+       vdev_t *tvd = vd->vdev_top;
+       uint8_t cant_read = vd->vdev_cant_read;
+       boolean_t required;
+
+       ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
+       if (vd == spa->spa_root_vdev || vd == tvd)
+               return (B_TRUE);
+
+       /*
+        * Temporarily mark the device as unreadable, and then determine
+        * whether this results in any DTL outages in the top-level vdev.
+        * If not, we can safely offline/detach/remove the device.
+        */
+       vd->vdev_cant_read = B_TRUE;
+       vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
+       required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
+       vd->vdev_cant_read = cant_read;
+       vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
+
+       return (required);
+}
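
Concretely: when asked to detach one half of a two-way mirror whose surviving half has DTL_MISSING entries, the simulated outage of the departing disk leaves those txgs unreadable from every child, the recomputed top-level DTL_OUTAGE comes back non-empty, and the operation is refused.
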
+
+/*
  * Determine if resilver is needed, and if so the txg range.
  */
 boolean_t
@@ -1472,19 +1603,19 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
 
        if (vd->vdev_children == 0) {
                mutex_enter(&vd->vdev_dtl_lock);
-               if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) {
+               if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
+                   vdev_writeable(vd)) {
                        space_seg_t *ss;
 
-                       ss = avl_first(&vd->vdev_dtl_map.sm_root);
+                       ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
                        thismin = ss->ss_start - 1;
-                       ss = avl_last(&vd->vdev_dtl_map.sm_root);
+                       ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
                        thismax = ss->ss_end;
                        needed = B_TRUE;
                }
                mutex_exit(&vd->vdev_dtl_lock);
        } else {
-               int c;
-               for (c = 0; c < vd->vdev_children; c++) {
+               for (int c = 0; c < vd->vdev_children; c++) {
                        vdev_t *cvd = vd->vdev_child[c];
                        uint64_t cmin, cmax;
 
@@ -1506,12 +1637,10 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
 void
 vdev_load(vdev_t *vd)
 {
-       int c;
-
        /*
         * Recursively load all children.
         */
-       for (c = 0; c < vd->vdev_children; c++)
+       for (int c = 0; c < vd->vdev_children; c++)
                vdev_load(vd->vdev_child[c]);
 
        /*
@@ -1731,11 +1860,7 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
            vd->vdev_parent->vdev_child[0] == vd)
                vd->vdev_unspare = B_TRUE;
 
-       (void) spa_vdev_state_exit(spa, vd, 0);
-
-       VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0);
-
-       return (0);
+       return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 int
@@ -1756,13 +1881,10 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
         */
        if (!vd->vdev_offline) {
                /*
-                * If this device's top-level vdev has a non-empty DTL,
-                * don't allow the device to be offlined.
-                *
-                * XXX -- make this more precise by allowing the offline
-                * as long as the remaining devices don't have any DTL holes.
+                * If this device has the only valid copy of some data,
+                * don't allow it to be offlined.
                 */
-               if (vd->vdev_top->vdev_dtl_map.sm_space != 0)
+               if (vd->vdev_aux == NULL && vdev_dtl_required(vd))
                        return (spa_vdev_state_exit(spa, NULL, EBUSY));
 
                /*
@@ -1772,7 +1894,7 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
                 */
                vd->vdev_offline = B_TRUE;
                vdev_reopen(vd->vdev_top);
-               if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) {
+               if (vd->vdev_aux == NULL && vdev_is_dead(vd->vdev_top)) {
                        vd->vdev_offline = B_FALSE;
                        vdev_reopen(vd->vdev_top);
                        return (spa_vdev_state_exit(spa, NULL, EBUSY));
@@ -1852,13 +1974,17 @@ vdev_writeable(vdev_t *vd)
 boolean_t
 vdev_allocatable(vdev_t *vd)
 {
+       uint64_t state = vd->vdev_state;
+
        /*
-        * We currently allow allocations from vdevs which maybe in the
+        * We currently allow allocations from vdevs which may be in the
         * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
         * fails to reopen then we'll catch it later when we're holding
-        * the proper locks.
+        * the proper locks.  Note that we have to get the vdev state
+        * in a local variable because although it changes atomically,
+        * we're asking two separate questions about it.
         */
-       return (!(vdev_is_dead(vd) && vd->vdev_state != VDEV_STATE_CLOSED) &&
+       return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
            !vd->vdev_cant_write);
 }
 
@@ -1928,7 +2054,8 @@ vdev_clear_stats(vdev_t *vd)
 void
 vdev_stat_update(zio_t *zio, uint64_t psize)
 {
-       vdev_t *rvd = zio->io_spa->spa_root_vdev;
+       spa_t *spa = zio->io_spa;
+       vdev_t *rvd = spa->spa_root_vdev;
        vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
        vdev_t *pvd;
        uint64_t txg = zio->io_txg;
@@ -1961,21 +2088,23 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
                        return;
 
                ASSERT(vd == zio->io_vd);
-               if (!(flags & ZIO_FLAG_IO_BYPASS)) {
-                       mutex_enter(&vd->vdev_stat_lock);
-                       vs->vs_ops[type]++;
-                       vs->vs_bytes[type] += psize;
-                       mutex_exit(&vd->vdev_stat_lock);
-               }
+
+               if (flags & ZIO_FLAG_IO_BYPASS)
+                       return;
+
+               mutex_enter(&vd->vdev_stat_lock);
+
                if (flags & ZIO_FLAG_IO_REPAIR) {
-                       ASSERT(zio->io_delegate_list == NULL);
-                       mutex_enter(&vd->vdev_stat_lock);
                        if (flags & ZIO_FLAG_SCRUB_THREAD)
                                vs->vs_scrub_repaired += psize;
-                       else
+                       if (flags & ZIO_FLAG_SELF_HEAL)
                                vs->vs_self_healed += psize;
-                       mutex_exit(&vd->vdev_stat_lock);
                }
+
+               vs->vs_ops[type]++;
+               vs->vs_bytes[type] += psize;
+
+               mutex_exit(&vd->vdev_stat_lock);
                return;
        }
 
@@ -1993,19 +2122,39 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
                vs->vs_write_errors++;
        mutex_exit(&vd->vdev_stat_lock);
 
-       if (type == ZIO_TYPE_WRITE && txg != 0 && vd->vdev_children == 0) {
-               if (flags & ZIO_FLAG_SCRUB_THREAD) {
-                       ASSERT(flags & ZIO_FLAG_IO_REPAIR);
-                       for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
-                               vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
-               }
-               if (!(flags & ZIO_FLAG_IO_REPAIR)) {
-                       if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
+       if (type == ZIO_TYPE_WRITE && txg != 0 &&
+           (!(flags & ZIO_FLAG_IO_REPAIR) ||
+           (flags & ZIO_FLAG_SCRUB_THREAD))) {
+               /*
+                * This is either a normal write (not a repair), or it's a
+                * repair induced by the scrub thread.  In the normal case,
+                * we commit the DTL change in the same txg as the block
+                * was born.  In the scrub-induced repair case, we know that
+                * scrubs run in first-pass syncing context, so we commit
+                * the DTL change in spa->spa_syncing_txg.
+                *
+                * We currently do not make DTL entries for failed spontaneous
+                * self-healing writes triggered by normal (non-scrubbing)
+                * reads, because we have no transactional context in which to
+                * do so -- and it's not clear that it'd be desirable anyway.
+                */
+               if (vd->vdev_ops->vdev_op_leaf) {
+                       uint64_t commit_txg = txg;
+                       if (flags & ZIO_FLAG_SCRUB_THREAD) {
+                               ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+                               ASSERT(spa_sync_pass(spa) == 1);
+                               vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
+                               commit_txg = spa->spa_syncing_txg;
+                       }
+                       ASSERT(commit_txg >= spa->spa_syncing_txg);
+                       if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
                                return;
-                       vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
-                       for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
-                               vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
+                       for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
+                               vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
+                       vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
                }
+               if (vd != rvd)
+                       vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
        }
 }
 
@@ -2218,7 +2367,8 @@ vdev_state_clean(vdev_t *vd)
 void
 vdev_propagate_state(vdev_t *vd)
 {
-       vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+       spa_t *spa = vd->vdev_spa;
+       vdev_t *rvd = spa->spa_root_vdev;
        int degraded = 0, faulted = 0;
        int corrupted = 0;
        int c;
@@ -2229,7 +2379,7 @@ vdev_propagate_state(vdev_t *vd)
                        child = vd->vdev_child[c];
 
                        if (!vdev_readable(child) ||
-                           (!vdev_writeable(child) && (spa_mode & FWRITE))) {
+                           (!vdev_writeable(child) && spa_writeable(spa))) {
                                /*
                                 * Root special: if there is a top-level log
                                 * device, treat the root vdev as if it were
index dc0e920..f91dddb 100644 (file)
@@ -61,7 +61,7 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
         */
        ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
        error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
-           spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
+           spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
 
        if (error) {
                vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
@@ -105,7 +105,8 @@ vdev_file_close(vdev_t *vd)
 
        if (vf->vf_vnode != NULL) {
                (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL);
-               (void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred, NULL);
+               (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0,
+                   kcred, NULL);
                VN_RELE(vf->vf_vnode);
        }
 
index bf93046..f8f9019 100644 (file)
@@ -277,9 +277,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
                    vd->vdev_islog) == 0);
        }
 
-       if (vd->vdev_dtl.smo_object != 0)
+       if (vd->vdev_dtl_smo.smo_object != 0)
                VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
-                   vd->vdev_dtl.smo_object) == 0);
+                   vd->vdev_dtl_smo.smo_object) == 0);
 
        if (getstats) {
                vdev_stat_t vs;
@@ -520,9 +520,6 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
            vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid))
                return (EBUSY);
 
-       ASSERT(reason != VDEV_LABEL_REMOVE ||
-           vdev_inuse(vd, crtxg, reason, NULL, NULL));
-
        /*
         * If this is a request to add or replace a spare or l2cache device
         * that is in use elsewhere on the system, then we must update the
@@ -705,6 +702,11 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
  */
 
 /*
+ * For use by zdb and debugging purposes only
+ */
+uint64_t ub_max_txg = UINT64_MAX;
+
+/*
  * Consider the following situation: txg is safely synced to disk.  We've
  * written the first uberblock for txg + 1, and then we lose power.  When we
  * come back up, we fail to see the uberblock for txg + 1 because, say,
@@ -741,7 +743,8 @@ vdev_uberblock_load_done(zio_t *zio)
 
        if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
                mutex_enter(&rio->io_lock);
-               if (vdev_uberblock_compare(ub, ubbest) > 0)
+               if (ub->ub_txg <= ub_max_txg &&
+                   vdev_uberblock_compare(ub, ubbest) > 0)
                        *ubbest = *ub;
                mutex_exit(&rio->io_lock);
        }
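
The cap turns uberblock selection into "best txg not exceeding ub_max_txg". A standalone sketch, with the uberblock reduced to the two fields that drive selection (illustrative code; the real comparison is vdev_uberblock_compare()):

#include <stdint.h>
#include <stddef.h>

typedef struct uberblock {
	uint64_t	ub_txg;
	uint64_t	ub_timestamp;
} uberblock_t;

uint64_t ub_max_txg = UINT64_MAX;	/* same default as above */

/*
 * Keep the best candidate whose txg does not exceed the cap: higher
 * txg wins, ties go to the later timestamp, and anything above
 * ub_max_txg is treated as if it had never been written.
 */
static const uberblock_t *
ub_select(const uberblock_t *ubs, int n)
{
	const uberblock_t *best = NULL;

	for (int i = 0; i < n; i++) {
		if (ubs[i].ub_txg > ub_max_txg)
			continue;
		if (best == NULL || ubs[i].ub_txg > best->ub_txg ||
		    (ubs[i].ub_txg == best->ub_txg &&
		    ubs[i].ub_timestamp > best->ub_timestamp))
			best = &ubs[i];
	}
	return (best);
}

int
main(void)
{
	uberblock_t ubs[] = { { 100, 5 }, { 102, 7 }, { 101, 6 } };

	ub_max_txg = 101;	/* hide txg 102 */
	return (ub_select(ubs, 3)->ub_txg == 101 ? 0 : 1);
}

Lowering ub_max_txg makes label scanning behave as if later uberblocks were never written, which is how the pool can be examined as of an earlier txg.
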
index c4629ff..184da82 100644 (file)
@@ -225,7 +225,7 @@ vdev_mirror_child_select(zio_t *zio)
                        mc->mc_skipped = 1;
                        continue;
                }
-               if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, txg, 1))
+               if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1))
                        return (c);
                mc->mc_error = ESTALE;
                mc->mc_skipped = 1;
@@ -282,20 +282,10 @@ vdev_mirror_io_start(zio_t *zio)
                ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 
                /*
-                * If this is a resilvering I/O to a replacing vdev,
-                * only the last child should be written -- unless the
-                * first child happens to have a DTL entry here as well.
-                * All other writes go to all children.
+                * Writes go to all children.
                 */
-               if ((zio->io_flags & ZIO_FLAG_RESILVER) && mm->mm_replacing &&
-                   !vdev_dtl_contains(&mm->mm_child[0].mc_vd->vdev_dtl_map,
-                   zio->io_txg, 1)) {
-                       c = mm->mm_children - 1;
-                       children = 1;
-               } else {
-                       c = 0;
-                       children = mm->mm_children;
-               }
+               c = 0;
+               children = mm->mm_children;
        }
 
        while (children--) {
@@ -398,7 +388,7 @@ vdev_mirror_io_done(zio_t *zio)
                ASSERT(zio->io_error != 0);
        }
 
-       if (good_copies && (spa_mode & FWRITE) &&
+       if (good_copies && spa_writeable(zio->io_spa) &&
            (unexpected_errors ||
            (zio->io_flags & ZIO_FLAG_RESILVER) ||
            ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) {
@@ -419,7 +409,7 @@ vdev_mirror_io_done(zio_t *zio)
                                if (mc->mc_tried)
                                        continue;
                                if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
-                                   !vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map,
+                                   !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
                                    zio->io_txg, 1))
                                        continue;
                                mc->mc_error = ESTALE;
@@ -429,7 +419,8 @@ vdev_mirror_io_done(zio_t *zio)
                            mc->mc_vd, mc->mc_offset,
                            zio->io_data, zio->io_size,
                            ZIO_TYPE_WRITE, zio->io_priority,
-                           ZIO_FLAG_IO_REPAIR, NULL, NULL));
+                           ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
+                           ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
                }
        }
 }
index 46fca0e..137afdd 100644 (file)
@@ -176,6 +176,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
        zio_t *fio, *lio, *aio, *dio;
        avl_tree_t *tree;
        uint64_t size;
+       int flags;
 
        ASSERT(MUTEX_HELD(&vq->vq_lock));
 
@@ -187,21 +188,32 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
 
        tree = fio->io_vdev_tree;
        size = fio->io_size;
-
-       while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) &&
-           !((dio->io_flags | fio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) &&
-           size + dio->io_size <= zfs_vdev_aggregation_limit) {
-               dio->io_delegate_next = fio;
-               fio = dio;
-               size += dio->io_size;
-       }
-
-       while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) &&
-           !((lio->io_flags | dio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) &&
-           size + dio->io_size <= zfs_vdev_aggregation_limit) {
-               lio->io_delegate_next = dio;
-               lio = dio;
-               size += dio->io_size;
+       flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
+
+       if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
+               /*
+                * We can aggregate I/Os that are adjacent and of the
+                * same flavor, as expressed by the AGG_INHERIT flags.
+                * The latter is necessary so that certain attributes
+                * of the I/O, such as whether it's a normal I/O or a
+                * scrub/resilver, can be preserved in the aggregate.
+                */
+               while ((dio = AVL_PREV(tree, fio)) != NULL &&
+                   IS_ADJACENT(dio, fio) &&
+                   (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
+                   size + dio->io_size <= zfs_vdev_aggregation_limit) {
+                       dio->io_delegate_next = fio;
+                       fio = dio;
+                       size += dio->io_size;
+               }
+               while ((dio = AVL_NEXT(tree, lio)) != NULL &&
+                   IS_ADJACENT(lio, dio) &&
+                   (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
+                   size + dio->io_size <= zfs_vdev_aggregation_limit) {
+                       lio->io_delegate_next = dio;
+                       lio = dio;
+                       size += dio->io_size;
+               }
        }
 
        if (fio != lio) {
@@ -212,7 +224,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
 
                aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
                    buf, size, fio->io_type, ZIO_PRIORITY_NOW,
-                   ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
+                   flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
                    vdev_queue_agg_io_done, NULL);
 
                aio->io_delegate_list = fio;
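
The adjacency-and-flavor test is easier to see against a flat, offset-sorted array than an AVL tree. A minimal sketch, where AGG_LIMIT and the flags field stand in for zfs_vdev_aggregation_limit and the ZIO_FLAG_AGG_INHERIT subset (all names illustrative):

#include <stdint.h>
#include <stdio.h>

typedef struct io {
	uint64_t	off;
	uint64_t	size;
	int		flags;	/* stand-in for the AGG_INHERIT subset */
} io_t;

#define	AGG_LIMIT	(128 << 10)

/*
 * Given I/Os sorted by offset, grow [*first, *last] around index i
 * while neighbors are physically adjacent, share the same inheritable
 * flags, and the aggregate stays within the limit.
 */
static void
aggregate(const io_t *q, int n, int i, int *first, int *last)
{
	uint64_t size = q[i].size;

	*first = *last = i;
	while (*first > 0 &&
	    q[*first - 1].off + q[*first - 1].size == q[*first].off &&
	    q[*first - 1].flags == q[i].flags &&
	    size + q[*first - 1].size <= AGG_LIMIT)
		size += q[--*first].size;
	while (*last < n - 1 &&
	    q[*last].off + q[*last].size == q[*last + 1].off &&
	    q[*last + 1].flags == q[i].flags &&
	    size + q[*last + 1].size <= AGG_LIMIT)
		size += q[++*last].size;
}

int
main(void)
{
	io_t q[] = {
		{ 0, 4096, 0 },
		{ 4096, 4096, 0 },
		{ 8192, 4096, 1 },	/* different flavor: never merged */
	};
	int first, last;

	aggregate(q, 3, 0, &first, &last);
	(void) printf("merged indices %d..%d\n", first, last);	/* 0..1 */
	return (0);
}

As in the kernel loop, the limit is checked before each extension, so the aggregate can never exceed AGG_LIMIT.
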
index 69e3144..ad997f5 100644 (file)
@@ -687,7 +687,7 @@ vdev_raidz_io_start(zio_t *zio)
                        rc->rc_skipped = 1;
                        continue;
                }
-               if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
+               if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
                        if (c >= rm->rm_firstdatacol)
                                rm->rm_missingdata++;
                        else
@@ -1165,7 +1165,7 @@ vdev_raidz_io_done(zio_t *zio)
 done:
        zio_checksum_verified(zio);
 
-       if (zio->io_error == 0 && (spa_mode & FWRITE) &&
+       if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
            (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
                /*
                 * Use the good data we have in hand to repair damaged children.
@@ -1180,7 +1180,8 @@ done:
                        zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
                            rc->rc_offset, rc->rc_data, rc->rc_size,
                            ZIO_TYPE_WRITE, zio->io_priority,
-                           ZIO_FLAG_IO_REPAIR, NULL, NULL));
+                           ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
+                           ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
                }
        }
 }
index 341dc4d..fdf92a1 100644 (file)
@@ -2148,12 +2148,12 @@ top:
                }
        }
 
-       error = dmu_tx_assign(tx, zfsvfs->z_assign);
+       error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error) {
                mutex_exit(&zp->z_acl_lock);
                mutex_exit(&zp->z_lock);
 
-               if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+               if (error == ERESTART) {
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
                        goto top;
@@ -2208,7 +2208,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
 
        *check_privs = B_TRUE;
 
-       if (zfsvfs->z_assign >= TXG_INITIAL) {          /* ZIL replay */
+       if (zfsvfs->z_replay) {
                *working_mode = 0;
                return (0);
        }
index ab97f83..cd36696 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/zfs_context.h>
 #include <sys/vfs.h>
 #include <sys/fs/zfs.h>
@@ -63,6 +61,20 @@ zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout)
 
        while (ptr < end) {
                if (zfs_layout) {
+                       /*
+                        * Avoid overrun.  Embedded aces can have one
+                        * of several sizes.  We don't know exactly
+                        * how many are present, only the size of the
+                        * buffer containing them.  That size may be
+                        * larger than needed to hold the aces
+                        * present.  As long as we do not do any
+                        * swapping beyond the end of our block we are
+                        * okay.  It is safe to swap any non-ace data
+                        * within the block since it is just zeros.
+                        */
+                       if (ptr + sizeof (zfs_ace_hdr_t) > end) {
+                               break;
+                       }
                        zacep = (zfs_ace_t *)ptr;
                        zacep->z_hdr.z_access_mask =
                            BSWAP_32(zacep->z_hdr.z_access_mask);
@@ -71,6 +83,10 @@ zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout)
                            BSWAP_16(zacep->z_hdr.z_type);
                        entry_type = zacep->z_hdr.z_flags & ACE_TYPE_FLAGS;
                } else {
+                       /* Overrun avoidance */
+                       if (ptr + sizeof (ace_t) > end) {
+                               break;
+                       }
                        acep = (ace_t *)ptr;
                        acep->a_access_mask = BSWAP_32(acep->a_access_mask);
                        acep->a_flags = BSWAP_16(acep->a_flags);
@@ -87,8 +103,14 @@ zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout)
                        break;
                case ACE_IDENTIFIER_GROUP:
                default:
+                       /* Overrun avoidance */
                        if (zfs_layout) {
-                               zacep->z_fuid = BSWAP_64(zacep->z_fuid);
+                               if (ptr + sizeof (zfs_ace_t) <= end) {
+                                       zacep->z_fuid = BSWAP_64(zacep->z_fuid);
+                               } else {
+                                       entry_size = sizeof (zfs_ace_t);
+                                       break;
+                               }
                        }
                        switch (ace_type) {
                        case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
@@ -169,7 +191,8 @@ zfs_znode_byteswap(void *buf, size_t size)
        if (zp->zp_acl.z_acl_version == ZFS_ACL_VERSION) {
                zfs_acl_byteswap((void *)&zp->zp_acl.z_ace_data[0],
                    ZFS_ACE_SPACE);
-       } else
+       } else {
                zfs_oldace_byteswap((ace_t *)&zp->zp_acl.z_ace_data[0],
                    ACE_SLOT_CNT);
+       }
 }
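
The bounds-checking pattern above generalizes to any buffer of variable-length records whose declared size may exceed the space the records actually occupy. A self-contained sketch -- rec_hdr_t, walk_records(), and the zero-length terminator convention are hypothetical, not ZFS structures:

#include <stdint.h>
#include <stdio.h>

typedef struct rec_hdr {
	uint16_t	rh_type;
	uint16_t	rh_size;	/* total record length in bytes */
} rec_hdr_t;

/*
 * Walk variable-length records without ever reading past 'end':
 * stop as soon as even a header would overrun, and treat a
 * too-small length (e.g. zero padding) as the end of the data.
 */
static void
walk_records(void *buf, size_t size, void (*fn)(rec_hdr_t *))
{
	uint8_t *ptr = buf;
	uint8_t *end = ptr + size;

	while (ptr + sizeof (rec_hdr_t) <= end) {
		rec_hdr_t *rh = (rec_hdr_t *)ptr;

		if (rh->rh_size < sizeof (rec_hdr_t) ||
		    ptr + rh->rh_size > end)
			break;
		fn(rh);
		ptr += rh->rh_size;
	}
}

static void
print_rec(rec_hdr_t *rh)
{
	(void) printf("type %u, %u bytes\n",
	    (unsigned)rh->rh_type, (unsigned)rh->rh_size);
}

int
main(void)
{
	uint8_t buf[64] = { 0 };	/* zero-padded, like an ACL block */
	rec_hdr_t *rh = (rec_hdr_t *)buf;

	rh->rh_type = 1;
	rh->rh_size = sizeof (rec_hdr_t);
	walk_records(buf, sizeof (buf), print_rec);	/* prints one record */
	return (0);
}
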
index 1ec4932..9353d01 100644 (file)
@@ -562,24 +562,6 @@ zfs_rmnode(znode_t *zp)
        ASSERT(zp->z_phys->zp_links == 0);
 
        /*
-        * If this is a ZIL replay then leave the object in the unlinked set.
-        * Otherwise we can get a deadlock, because the delete can be
-        * quite large and span multiple tx's and txgs, but each replay
-        * creates a tx to atomically run the replay function and mark the
-        * replay record as complete. We deadlock trying to start a tx in
-        * a new txg to further the deletion but can't because the replay
-        * tx hasn't finished.
-        *
-        * We actually delete the object if we get a failure to create an
-        * object in zil_replay_log_record(), or after calling zil_replay().
-        */
-       if (zfsvfs->z_assign >= TXG_INITIAL) {
-               zfs_znode_dmu_fini(zp);
-               zfs_znode_free(zp);
-               return;
-       }
-
-       /*
         * If this is an attribute directory, purge its contents.
         */
        if (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR)) {
@@ -845,9 +827,9 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
                            FUID_SIZE_ESTIMATE(zfsvfs));
                }
        }
-       error = dmu_tx_assign(tx, zfsvfs->z_assign);
+       error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error) {
-               if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
+               if (error == ERESTART)
                        dmu_tx_wait(tx);
                dmu_tx_abort(tx);
                return (error);
@@ -930,7 +912,7 @@ top:
        error = zfs_make_xattrdir(zp, &va, xvpp, cr);
        zfs_dirent_unlock(dl);
 
-       if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+       if (error == ERESTART) {
                /* NB: we already did dmu_tx_wait() if necessary */
                goto top;
        }
@@ -959,7 +941,7 @@ zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
        uid_t           fowner;
        zfsvfs_t        *zfsvfs = zdp->z_zfsvfs;
 
-       if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL)     /* ZIL replay */
+       if (zdp->z_zfsvfs->z_replay)
                return (0);
 
        if ((zdp->z_phys->zp_mode & S_ISVTX) == 0)
index 7cb5052..286dafb 100644 (file)
@@ -519,7 +519,6 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
        uint32_t rid;
        idmap_stat status;
        uint64_t idx;
-       boolean_t is_replay = (zfsvfs->z_assign >= TXG_INITIAL);
        zfs_fuid_t *zfuid = NULL;
        zfs_fuid_info_t *fuidp;
 
@@ -534,7 +533,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
        if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0)
                return (id);
 
-       if (is_replay) {
+       if (zfsvfs->z_replay) {
                fuidp = zfsvfs->z_fuid_replay;
 
                /*
@@ -584,7 +583,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
 
        idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx);
 
-       if (!is_replay)
+       if (!zfsvfs->z_replay)
                zfs_fuid_node_add(fuidpp, kdomain, rid, idx, id, type);
        else if (zfuid != NULL) {
                list_remove(&fuidp->z_fuids, zfuid);
index b6ad574..49ee552 100644 (file)
@@ -856,9 +856,10 @@ zfs_ioc_pool_export(zfs_cmd_t *zc)
 {
        int error;
        boolean_t force = (boolean_t)zc->zc_cookie;
+       boolean_t hardforce = (boolean_t)zc->zc_guid;
 
        zfs_log_history(zc);
-       error = spa_export(zc->zc_name, NULL, force);
+       error = spa_export(zc->zc_name, NULL, force, hardforce);
        return (error);
 }
 
@@ -1162,7 +1163,7 @@ zfs_ioc_vdev_detach(zfs_cmd_t *zc)
        if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
                return (error);
 
-       error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE);
+       error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE);
 
        spa_close(spa, FTAG);
        return (error);
index 11cd4c2..84d64b4 100644 (file)
 #include <sys/spa.h>
 #include <sys/zfs_fuid.h>
 #include <sys/ddi.h>
+#include <sys/dsl_dataset.h>
+
+#define        ZFS_HANDLE_REPLAY(zilog, tx) \
+       if (zilog->zl_replay) { \
+               dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); \
+               zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = \
+                   zilog->zl_replaying_seq; \
+               return; \
+       }
 
 /*
- * All the functions in this file are used to construct the log entries
- * to record transactions. They allocate * an intent log transaction
- * structure (itx_t) and save within it all the information necessary to
- * possibly replay the transaction. The itx is then assigned a sequence
- * number and inserted in the in-memory list anchored in the zilog.
+ * These zfs_log_* functions must be called within a dmu tx, in one
+ * of two contexts depending on zilog->zl_replay:
+ *
+ * Non replay mode
+ * ---------------
+ * We need to record the transaction so that if it is committed to
+ * the Intent Log then it can be replayed.  An intent log transaction
+ * structure (itx_t) is allocated and all the information necessary to
+ * possibly replay the transaction is saved in it. The itx is then assigned
+ * a sequence number and inserted in the in-memory list anchored in the zilog.
+ *
+ * Replay mode
+ * -----------
+ * We need to mark the intent log record as replayed in the log header.
+ * This is done in the same transaction as the replay so that they
+ * commit atomically.
  */
 
 int
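
A toy analog of the two contexts, independent of the real zilog/itx machinery (log_t and log_record() are illustrative only):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for the two zfs_log_* contexts described above. */
typedef struct log {
	bool		replaying;
	uint64_t	replaying_seq;	/* seq of the record being replayed */
	uint64_t	replayed_seq;	/* high-water mark to persist */
	uint64_t	next_seq;	/* next seq for ordinary records */
} log_t;

static void
log_record(log_t *lg, const char *op)
{
	if (lg->replaying) {
		/* Replay mode: only mark how far replay has progressed. */
		lg->replayed_seq = lg->replaying_seq;
		return;
	}
	/* Normal mode: assign a sequence number and queue the record. */
	(void) printf("itx %llu: %s\n",
	    (unsigned long long)++lg->next_seq, op);
}

int
main(void)
{
	log_t lg = { false, 0, 0, 0 };

	log_record(&lg, "create");	/* itx 1: create */
	lg.replaying = true;
	lg.replaying_seq = 1;
	log_record(&lg, "create");	/* no output; replayed_seq = 1 */
	return (0);
}

In replay mode nothing is appended; the call's only job is to note, in the same transaction, how far replay has progressed -- which is exactly what ZFS_HANDLE_REPLAY does above.
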
@@ -231,6 +251,8 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
        if (zilog == NULL)
                return;
 
+       ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
        /*
         * If we have FUIDs present then add in space for
         * domains and ACE fuid's if any.
@@ -334,6 +356,8 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
        if (zilog == NULL)
                return;
 
+       ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
        itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
        lr = (lr_remove_t *)&itx->itx_lr;
        lr->lr_doid = dzp->z_id;
@@ -358,6 +382,8 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
        if (zilog == NULL)
                return;
 
+       ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
        itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
        lr = (lr_link_t *)&itx->itx_lr;
        lr->lr_doid = dzp->z_id;
@@ -385,6 +411,8 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
        if (zilog == NULL)
                return;
 
+       ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
        itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
        lr = (lr_create_t *)&itx->itx_lr;
        lr->lr_doid = dzp->z_id;
@@ -419,6 +447,8 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
        if (zilog == NULL)
                return;
 
+       ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
        itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
        lr = (lr_rename_t *)&itx->itx_lr;
        lr->lr_sdoid = sdzp->z_id;
@@ -451,6 +481,8 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
        if (zilog == NULL || zp->z_unlinked)
                return;
 
+       ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
        /*
         * Writes are handled in three different ways:
         *
@@ -549,6 +581,8 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
        if (zilog == NULL || zp->z_unlinked)
                return;
 
+       ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
        itx = zil_itx_create(txtype, sizeof (*lr));
        lr = (lr_truncate_t *)&itx->itx_lr;
        lr->lr_foid = zp->z_id;
@@ -578,6 +612,8 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
        if (zilog == NULL || zp->z_unlinked)
                return;
 
+       ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
        /*
         * If XVATTR set, then log record size needs to allow
         * for lr_attr_t + xvattr mask, mapsize and create time
@@ -644,6 +680,8 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
        if (zilog == NULL || zp->z_unlinked)
                return;
 
+       ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
        txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ?
            TX_ACL_V0 : TX_ACL;
 
index 06b4dee..1bf1bc5 100644 (file)
@@ -583,21 +583,50 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
                 * allow replays to succeed.
                 */
                readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
-               zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
-
-               /*
-                * Parse and replay the intent log.
-                */
-               zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
-                   zfs_replay_vector, zfs_unlinked_drain);
+               if (readonly != 0)
+                       zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
+               else
+                       zfs_unlinked_drain(zfsvfs);
 
-               zfs_unlinked_drain(zfsvfs);
+               zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
+               if (zil_disable) {
+                       zil_destroy(zfsvfs->z_log, 0);
+                       zfsvfs->z_log = NULL;
+               } else {
+                       /*
+                        * Parse and replay the intent log.
+                        *
+                        * Because of ziltest, this must be done after
+                        * zfs_unlinked_drain().  (Further note: ziltest
+                        * doesn't use readonly mounts, where
+                        * zfs_unlinked_drain() isn't called.)  This is because
+                        * ziltest causes spa_sync() to think it's committed,
+                        * but actually it is not, so the intent log contains
+                        * many txg's worth of changes.
+                        *
+                        * In particular, if object N is in the unlinked set in
+                        * the last txg to actually sync, then it could be
+                        * actually freed in a later txg and then reallocated
+                        * in a yet later txg.  This would write a "create
+                        * object N" record to the intent log.  Normally, this
+                        * would be fine because the spa_sync() would have
+                        * written out the fact that object N is free, before
+                        * we could write the "create object N" intent log
+                        * record.
+                        *
+                        * But when we are in ziltest mode, we advance the "open
+                        * txg" without actually spa_sync()-ing the changes to
+                        * disk.  So we would see that object N is still
+                        * allocated and in the unlinked set, and there is an
+                        * intent log record saying to allocate it.
+                        */
+                       zfsvfs->z_replay = B_TRUE;
+                       zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector);
+                       zfsvfs->z_replay = B_FALSE;
+               }
                zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
        }
 
-       if (!zil_disable)
-               zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
-
        return (0);
 }
 
@@ -634,7 +663,6 @@ zfs_domount(vfs_t *vfsp, char *osname)
        zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
        zfsvfs->z_vfs = vfsp;
        zfsvfs->z_parent = zfsvfs;
-       zfsvfs->z_assign = TXG_NOWAIT;
        zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
        zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
 
index 8e0037e..f62d3bf 100644 (file)
  *  (3)        All range locks must be grabbed before calling dmu_tx_assign(),
  *     as they can span dmu_tx_assign() calls.
  *
- *  (4)        Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
- *     In normal operation, this will be TXG_NOWAIT.  During ZIL replay,
- *     it will be a specific txg.  Either way, dmu_tx_assign() never blocks.
+ *  (4)        Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
  *     This is critical because we don't want to block while holding locks.
  *     Note, in particular, that if a lock is sometimes acquired before
  *     the tx assigns, and sometimes after (e.g. z_lock), then failing to
  *  (5)        If the operation succeeded, generate the intent log entry for it
  *     before dropping locks.  This ensures that the ordering of events
  *     in the intent log matches the order in which they actually occurred.
+ *      During ZIL replay the zfs_log_* functions will update the sequence
+ *     number to indicate the zil transaction has been replayed.
  *
  *  (6)        At the end of each vnode op, the DMU tx must always commit,
  *     regardless of whether there were any errors.
  *     rw_enter(...);                  // grab any other locks you need
  *     tx = dmu_tx_create(...);        // get DMU tx
  *     dmu_tx_hold_*();                // hold each object you might modify
- *     error = dmu_tx_assign(tx, zfsvfs->z_assign);    // try to assign
+ *     error = dmu_tx_assign(tx, TXG_NOWAIT);  // try to assign
  *     if (error) {
  *             rw_exit(...);           // drop locks
  *             zfs_dirent_unlock(dl);  // unlock directory entry
  *             VN_RELE(...);           // release held vnodes
- *             if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ *             if (error == ERESTART) {
  *                     dmu_tx_wait(tx);
  *                     dmu_tx_abort(tx);
  *                     goto top;
@@ -698,10 +698,9 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
                tx = dmu_tx_create(zfsvfs->z_os);
                dmu_tx_hold_bonus(tx, zp->z_id);
                dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
-               error = dmu_tx_assign(tx, zfsvfs->z_assign);
+               error = dmu_tx_assign(tx, TXG_NOWAIT);
                if (error) {
-                       if (error == ERESTART &&
-                           zfsvfs->z_assign == TXG_NOWAIT) {
+                       if (error == ERESTART) {
                                dmu_tx_wait(tx);
                                dmu_tx_abort(tx);
                                continue;
@@ -807,7 +806,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
         * If we're in replay mode, or we made no progress, return error.
         * Otherwise, it's at least a partial write, so it's successful.
         */
-       if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
+       if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
                ZFS_EXIT(zfsvfs);
                return (error);
        }
@@ -1233,11 +1232,10 @@ top:
                        dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
                            0, SPA_MAXBLOCKSIZE);
                }
-               error = dmu_tx_assign(tx, zfsvfs->z_assign);
+               error = dmu_tx_assign(tx, TXG_NOWAIT);
                if (error) {
                        zfs_dirent_unlock(dl);
-                       if (error == ERESTART &&
-                           zfsvfs->z_assign == TXG_NOWAIT) {
+                       if (error == ERESTART) {
                                dmu_tx_wait(tx);
                                dmu_tx_abort(tx);
                                goto top;
@@ -1449,11 +1447,11 @@ top:
        /* charge as an update -- would be nice not to charge at all */
        dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 
-       error = dmu_tx_assign(tx, zfsvfs->z_assign);
+       error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error) {
                zfs_dirent_unlock(dl);
                VN_RELE(vp);
-               if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+               if (error == ERESTART) {
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
                        goto top;
@@ -1659,10 +1657,10 @@ top:
        if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp)
                dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
                    0, SPA_MAXBLOCKSIZE);
-       error = dmu_tx_assign(tx, zfsvfs->z_assign);
+       error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error) {
                zfs_dirent_unlock(dl);
-               if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+               if (error == ERESTART) {
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
                        goto top;
@@ -1789,13 +1787,13 @@ top:
        dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
        dmu_tx_hold_bonus(tx, zp->z_id);
        dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
-       error = dmu_tx_assign(tx, zfsvfs->z_assign);
+       error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error) {
                rw_exit(&zp->z_parent_lock);
                rw_exit(&zp->z_name_lock);
                zfs_dirent_unlock(dl);
                VN_RELE(vp);
-               if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+               if (error == ERESTART) {
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
                        goto top;
@@ -2342,6 +2340,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
        zilog_t         *zilog;
        dmu_tx_t        *tx;
        vattr_t         oldva;
+       xvattr_t        tmpxvattr;
        uint_t          mask = vap->va_mask;
        uint_t          saved_mask;
        int             trim_mask = 0;
@@ -2396,6 +2395,8 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
         */
        xoap = xva_getxoptattr(xvap);
 
+       xva_init(&tmpxvattr);
+
        /*
         * Immutable files can only alter immutable bit and atime
         */
@@ -2518,28 +2519,78 @@ top:
        oldva.va_mode = pzp->zp_mode;
        zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
        if (mask & AT_XVATTR) {
-               if ((need_policy == FALSE) &&
-                   (XVA_ISSET_REQ(xvap, XAT_APPENDONLY) &&
-                   xoap->xoa_appendonly !=
-                   ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) ||
-                   (XVA_ISSET_REQ(xvap, XAT_NOUNLINK) &&
-                   xoap->xoa_nounlink !=
-                   ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) ||
-                   (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) &&
-                   xoap->xoa_immutable !=
-                   ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) ||
-                   (XVA_ISSET_REQ(xvap, XAT_NODUMP) &&
-                   xoap->xoa_nodump !=
-                   ((pzp->zp_flags & ZFS_NODUMP) != 0)) ||
-                   (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) &&
-                   xoap->xoa_av_modified !=
-                   ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) ||
-                   ((XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) &&
-                   ((vp->v_type != VREG && xoap->xoa_av_quarantined) ||
-                   xoap->xoa_av_quarantined !=
-                   ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)))) ||
-                   (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
-                   (XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
+               /*
+                * Update xvattr mask to include only those attributes
+                * that are actually changing.
+                *
+                * The bits will be restored prior to actually setting
+                * the attributes, so that the caller sees them as set.
+                */
+               if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+                       if (xoap->xoa_appendonly !=
+                           ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) {
+                               need_policy = TRUE;
+                       } else {
+                               XVA_CLR_REQ(xvap, XAT_APPENDONLY);
+                               XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
+                       }
+               }
+
+               if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+                       if (xoap->xoa_nounlink !=
+                           ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) {
+                               need_policy = TRUE;
+                       } else {
+                               XVA_CLR_REQ(xvap, XAT_NOUNLINK);
+                               XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
+                       }
+               }
+
+               if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+                       if (xoap->xoa_immutable !=
+                           ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) {
+                               need_policy = TRUE;
+                       } else {
+                               XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
+                               XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
+                       }
+               }
+
+               if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+                       if (xoap->xoa_nodump !=
+                           ((pzp->zp_flags & ZFS_NODUMP) != 0)) {
+                               need_policy = TRUE;
+                       } else {
+                               XVA_CLR_REQ(xvap, XAT_NODUMP);
+                               XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
+                       }
+               }
+
+               if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+                       if (xoap->xoa_av_modified !=
+                           ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) {
+                               need_policy = TRUE;
+                       } else {
+                               XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
+                               XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
+                       }
+               }
+
+               if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+                       if ((vp->v_type != VREG &&
+                           xoap->xoa_av_quarantined) ||
+                           xoap->xoa_av_quarantined !=
+                           ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)) {
+                               need_policy = TRUE;
+                       } else {
+                               XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
+                               XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
+                       }
+               }
+
+               if (need_policy == FALSE &&
+                   (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
+                   XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
                        need_policy = TRUE;
                }
        }
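
The six blocks above all follow the same shape; a condensed sketch of the
pattern using a hypothetical helper (xvattr_trim_req is not in the source,
and the headers assumed are the Solaris <sys/types.h>/<sys/vnode.h> xvattr
machinery that the hunk itself relies on):

	#include <sys/types.h>
	#include <sys/vnode.h>

	/*
	 * If the requested value already matches the current flag, the
	 * request is a no-op: drop it from xvap so no policy check is
	 * charged for it, and park it in tmpxvap so it can be restored
	 * (and reported as set) after the attributes are applied.
	 */
	static void
	xvattr_trim_req(xvattr_t *xvap, xvattr_t *tmpxvap, uint_t xat,
	    boolean_t requested, boolean_t current, boolean_t *need_policy)
	{
		if (!XVA_ISSET_REQ(xvap, xat))
			return;

		if (requested != current) {
			*need_policy = B_TRUE;	/* real change: check policy */
		} else {
			XVA_CLR_REQ(xvap, xat);	/* drop the no-op request */
			XVA_SET_REQ(tmpxvap, xat); /* remember it for later */
		}
	}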
@@ -2649,7 +2700,7 @@ top:
                dmu_tx_hold_bonus(tx, attrzp->z_id);
        }
 
-       err = dmu_tx_assign(tx, zfsvfs->z_assign);
+       err = dmu_tx_assign(tx, TXG_NOWAIT);
        if (err) {
                if (attrzp)
                        VN_RELE(ZTOV(attrzp));
@@ -2659,7 +2710,7 @@ top:
                        aclp = NULL;
                }
 
-               if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+               if (err == ERESTART) {
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
                        goto top;
@@ -2732,6 +2783,31 @@ top:
         */
 
        if (xoap && (mask & AT_XVATTR)) {
+
+               /*
+                * Restore the trimmed-off masks so that the return
+                * masks can be set for the caller.
+                */
+
+               if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
+                       XVA_SET_REQ(xvap, XAT_APPENDONLY);
+               }
+               if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
+                       XVA_SET_REQ(xvap, XAT_NOUNLINK);
+               }
+               if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
+                       XVA_SET_REQ(xvap, XAT_IMMUTABLE);
+               }
+               if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
+                       XVA_SET_REQ(xvap, XAT_NODUMP);
+               }
+               if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
+                       XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
+               }
+               if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
+                       XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
+               }
+
                if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
                        size_t len;
                        dmu_object_info_t doi;
@@ -3104,7 +3180,7 @@ top:
        if (tzp)
                dmu_tx_hold_bonus(tx, tzp->z_id);       /* parent changes */
        dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
-       error = dmu_tx_assign(tx, zfsvfs->z_assign);
+       error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error) {
                if (zl != NULL)
                        zfs_rename_unlock(&zl);
@@ -3113,7 +3189,7 @@ top:
                VN_RELE(ZTOV(szp));
                if (tzp)
                        VN_RELE(ZTOV(tzp));
-               if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+               if (error == ERESTART) {
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
                        goto top;
@@ -3242,10 +3318,10 @@ top:
                            FUID_SIZE_ESTIMATE(zfsvfs));
                }
        }
-       error = dmu_tx_assign(tx, zfsvfs->z_assign);
+       error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error) {
                zfs_dirent_unlock(dl);
-               if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+               if (error == ERESTART) {
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
                        goto top;
@@ -3462,10 +3538,10 @@ top:
        tx = dmu_tx_create(zfsvfs->z_os);
        dmu_tx_hold_bonus(tx, szp->z_id);
        dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
-       error = dmu_tx_assign(tx, zfsvfs->z_assign);
+       error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error) {
                zfs_dirent_unlock(dl);
-               if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+               if (error == ERESTART) {
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
                        goto top;
@@ -3547,7 +3623,7 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
        len = PAGESIZE;
        /*
         * If our blocksize is bigger than the page size, try to kluster
-        * muiltiple pages so that we write a full block (thus avoiding
+        * multiple pages so that we write a full block (thus avoiding
         * a read-modify-write).
         */
        if (off < filesz && zp->z_blksz > PAGESIZE) {
@@ -3589,9 +3665,9 @@ top:
        tx = dmu_tx_create(zfsvfs->z_os);
        dmu_tx_hold_write(tx, zp->z_id, off, len);
        dmu_tx_hold_bonus(tx, zp->z_id);
-       err = dmu_tx_assign(tx, zfsvfs->z_assign);
+       err = dmu_tx_assign(tx, TXG_NOWAIT);
        if (err != 0) {
-               if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+               if (err == ERESTART) {
                        zfs_range_unlock(rl);
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
index 25751ae..9a78603 100644 (file)
@@ -734,7 +734,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
 
        ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
 
-       if (zfsvfs->z_assign >= TXG_INITIAL) {          /* ZIL replay */
+       if (zfsvfs->z_replay) {
                obj = vap->va_nodeid;
                flag |= IS_REPLAY;
                now = vap->va_ctime;            /* see zfs_replay_create() */
@@ -1254,9 +1254,9 @@ top:
                newblksz = 0;
        }
 
-       error = dmu_tx_assign(tx, zfsvfs->z_assign);
+       error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error) {
-               if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+               if (error == ERESTART) {
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
                        goto top;
@@ -1358,9 +1358,9 @@ zfs_trunc(znode_t *zp, uint64_t end)
 top:
        tx = dmu_tx_create(zfsvfs->z_os);
        dmu_tx_hold_bonus(tx, zp->z_id);
-       error = dmu_tx_assign(tx, zfsvfs->z_assign);
+       error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error) {
-               if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+               if (error == ERESTART) {
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
                        goto top;
@@ -1456,9 +1456,9 @@ zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
 log:
        tx = dmu_tx_create(zfsvfs->z_os);
        dmu_tx_hold_bonus(tx, zp->z_id);
-       error = dmu_tx_assign(tx, zfsvfs->z_assign);
+       error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error) {
-               if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+               if (error == ERESTART) {
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
                        goto log;
@@ -1562,7 +1562,6 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
        bzero(&zfsvfs, sizeof (zfsvfs_t));
 
        zfsvfs.z_os = os;
-       zfsvfs.z_assign = TXG_NOWAIT;
        zfsvfs.z_parent = &zfsvfs;
        zfsvfs.z_version = version;
        zfsvfs.z_use_fuids = USE_FUIDS(version, os);
index 9510188..83fef0d 100644 (file)
@@ -351,14 +351,20 @@ zil_create(zilog_t *zilog)
        blk = zh->zh_log;
 
        /*
-        * If we don't already have an initial log block, allocate one now.
+        * If we don't already have an initial log block, or we have one
+        * but it's the wrong endianness, then allocate a new one.
         */
-       if (BP_IS_HOLE(&blk)) {
+       if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
                tx = dmu_tx_create(zilog->zl_os);
                (void) dmu_tx_assign(tx, TXG_WAIT);
                dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
                txg = dmu_tx_get_txg(tx);
 
+               if (!BP_IS_HOLE(&blk)) {
+                       zio_free_blk(zilog->zl_spa, &blk, txg);
+                       BP_ZERO(&blk);
+               }
+
                error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk,
                    NULL, txg);
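
Note that a hole never needs byteswapping, so the two halves of the test
above are disjoint: a foreign-endian log block is freed and a fresh native
one is allocated in the same txg. For reference, the byteswap test keys off
the byte-order bit in the block pointer, roughly as follows (a paraphrase,
not verbatim from sys/spa.h):

	/* Paraphrase; see sys/spa.h for the real definitions. */
	#define	BP_SHOULD_BYTESWAP(bp) \
		(BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)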
 
@@ -1214,7 +1220,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 
        ASSERT(zilog->zl_stop_sync == 0);
 
-       zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK];
+       zh->zh_replay_seq = zilog->zl_replayed_seq[txg & TXG_MASK];
 
        if (zilog->zl_destroy_txg == txg) {
                blkptr_t blk = zh->zh_log;
@@ -1223,7 +1229,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
                ASSERT(spa_sync_pass(spa) == 1);
 
                bzero(zh, sizeof (zil_header_t));
-               bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq));
+               bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
 
                if (zilog->zl_keep_first) {
                        /*
@@ -1460,9 +1466,7 @@ zil_resume(zilog_t *zilog)
 typedef struct zil_replay_arg {
        objset_t        *zr_os;
        zil_replay_func_t **zr_replay;
-       zil_replay_cleaner_t *zr_replay_cleaner;
        void            *zr_arg;
-       uint64_t        *zr_txgp;
        boolean_t       zr_byteswap;
        char            *zr_lrbuf;
 } zil_replay_arg_t;
@@ -1475,9 +1479,9 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
        uint64_t reclen = lr->lrc_reclen;
        uint64_t txtype = lr->lrc_txtype;
        char *name;
-       int pass, error, sunk;
+       int pass, error;
 
-       if (zilog->zl_stop_replay)
+       if (!zilog->zl_replay)                  /* giving up */
                return;
 
        if (lr->lrc_txg < claim_txg)            /* already committed */
@@ -1489,6 +1493,11 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
        /* Strip case-insensitive bit, still present in log record */
        txtype &= ~TX_CI;
 
+       if (txtype == 0 || txtype >= TX_MAX_TYPE) {
+               error = EINVAL;
+               goto bad;
+       }
+
        /*
         * Make a copy of the data so we can revise and extend it.
         */
@@ -1539,69 +1548,16 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
        }
 
        /*
-        * Replay of large truncates can end up needing additional txs
-        * and a different txg. If they are nested within the replay tx
-        * as below then a hang is possible. So we do the truncate here
-        * and redo the truncate later (a no-op) and update the sequence
-        * number whilst in the replay tx. Fortunately, it's safe to repeat
-        * a truncate if we crash and the truncate commits. A create over
-        * an existing file will also come in as a TX_TRUNCATE record.
-        *
-        * Note, remove of large files and renames over large files is
-        * handled by putting the deleted object on a stable list
-        * and if necessary force deleting the object outside of the replay
-        * transaction using the zr_replay_cleaner.
-        */
-       if (txtype == TX_TRUNCATE) {
-               *zr->zr_txgp = TXG_NOWAIT;
-               error = zr->zr_replay[TX_TRUNCATE](zr->zr_arg, zr->zr_lrbuf,
-                   zr->zr_byteswap);
-               if (error)
-                       goto bad;
-               zr->zr_byteswap = 0; /* only byteswap once */
-       }
-
-       /*
         * We must now do two things atomically: replay this log record,
-        * and update the log header to reflect the fact that we did so.
-        * We use the DMU's ability to assign into a specific txg to do this.
+        * and update the log header sequence number to reflect the fact that
+        * we did so. At the end of each replay function the sequence number
+        * is updated if we are in replay mode.
         */
-       for (pass = 1, sunk = B_FALSE; /* CONSTANTCONDITION */; pass++) {
-               uint64_t replay_txg;
-               dmu_tx_t *replay_tx;
-
-               replay_tx = dmu_tx_create(zr->zr_os);
-               error = dmu_tx_assign(replay_tx, TXG_WAIT);
-               if (error) {
-                       dmu_tx_abort(replay_tx);
-                       break;
-               }
-
-               replay_txg = dmu_tx_get_txg(replay_tx);
-
-               if (txtype == 0 || txtype >= TX_MAX_TYPE) {
-                       error = EINVAL;
-               } else {
-                       /*
-                        * On the first pass, arrange for the replay vector
-                        * to fail its dmu_tx_assign().  That's the only way
-                        * to ensure that those code paths remain well tested.
-                        *
-                        * Only byteswap (if needed) on the 1st pass.
-                        */
-                       *zr->zr_txgp = replay_txg - (pass == 1);
-                       error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
-                           zr->zr_byteswap && pass == 1);
-                       *zr->zr_txgp = TXG_NOWAIT;
-               }
-
-               if (error == 0) {
-                       dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx);
-                       zilog->zl_replay_seq[replay_txg & TXG_MASK] =
-                           lr->lrc_seq;
-               }
-
-               dmu_tx_commit(replay_tx);
+       for (pass = 1; pass <= 2; pass++) {
+               zilog->zl_replaying_seq = lr->lrc_seq;
+               /* Only byteswap (if needed) on the 1st pass.  */
+               error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
+                   zr->zr_byteswap && pass == 1);
 
                if (!error)
                        return;
@@ -1609,37 +1565,22 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
                /*
                 * The DMU's dnode layer doesn't see removes until the txg
                 * commits, so a subsequent claim can spuriously fail with
-                * EEXIST. So if we receive any error other than ERESTART
-                * we try syncing out any removes then retrying the
-                * transaction.
+                * EEXIST. So if we receive any error we try syncing out
+                * any removes, then retry the transaction.
                 */
-               if (error != ERESTART && !sunk) {
-                       if (zr->zr_replay_cleaner)
-                               zr->zr_replay_cleaner(zr->zr_arg);
+               if (pass == 1)
                        txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
-                       sunk = B_TRUE;
-                       continue; /* retry */
-               }
-
-               if (error != ERESTART)
-                       break;
-
-               if (pass != 1)
-                       txg_wait_open(spa_get_dsl(zilog->zl_spa),
-                           replay_txg + 1);
-
-               dprintf("pass %d, retrying\n", pass);
        }
 
 bad:
-       ASSERT(error && error != ERESTART);
+       ASSERT(error);
        name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
        dmu_objset_name(zr->zr_os, name);
        cmn_err(CE_WARN, "ZFS replay transaction error %d, "
            "dataset %s, seq 0x%llx, txtype %llu %s\n",
            error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype,
            (lr->lrc_txtype & TX_CI) ? "CI" : "");
-       zilog->zl_stop_replay = 1;
+       zilog->zl_replay = B_FALSE;
        kmem_free(name, MAXNAMELEN);
 }
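
The per-record half of that contract now lives in the logging functions
themselves: each zfs_log_*/zvol_log_* function takes roughly the following
shape (a sketch mirroring the zvol_log_write() hunk further down; xxx_log_op
is a placeholder name, and the zilog fields are the ones this change adds to
sys/zil_impl.h):

	static void
	xxx_log_op(zilog_t *zilog, dmu_tx_t *tx /* , per-op arguments */)
	{
		if (zilog->zl_replay) {
			/*
			 * Replaying: don't generate a new itx.  Dirty the
			 * dataset and record how far replay has progressed,
			 * keyed by the txg this replay tx landed in.
			 */
			dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
			zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
			    zilog->zl_replaying_seq;
			return;
		}

		/* Normal path: build the itx and zil_itx_assign() it. */
	}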
 
@@ -1654,9 +1595,7 @@ zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
  * If this dataset has a non-empty intent log, replay it and destroy it.
  */
 void
-zil_replay(objset_t *os, void *arg, uint64_t *txgp,
-       zil_replay_func_t *replay_func[TX_MAX_TYPE],
-       zil_replay_cleaner_t *replay_cleaner)
+zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
 {
        zilog_t *zilog = dmu_objset_zil(os);
        const zil_header_t *zh = zilog->zl_header;
@@ -1669,9 +1608,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp,
 
        zr.zr_os = os;
        zr.zr_replay = replay_func;
-       zr.zr_replay_cleaner = replay_cleaner;
        zr.zr_arg = arg;
-       zr.zr_txgp = txgp;
        zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
        zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
 
@@ -1680,7 +1617,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp,
         */
        txg_wait_synced(zilog->zl_dmu_pool, 0);
 
-       zilog->zl_stop_replay = 0;
+       zilog->zl_replay = B_TRUE;
        zilog->zl_replay_time = lbolt;
        ASSERT(zilog->zl_replay_blks == 0);
        (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
@@ -1689,6 +1626,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp,
 
        zil_destroy(zilog, B_FALSE);
        txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+       zilog->zl_replay = B_FALSE;
 }
 
 /*
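
With the txgp and cleaner parameters gone, call sites shrink to an objset, a
callback argument, and the replay vector; for example (this is the zvol call
from the hunk below, and filesystem mounts make the equivalent call with
their own replay vector):

	zil_replay(os, zv, zvol_replay_vector);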
index d347920..62af799 100644 (file)
@@ -767,7 +767,8 @@ zio_read_bp_init(zio_t *zio)
 {
        blkptr_t *bp = zio->io_bp;
 
-       if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_logical == zio) {
+       if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
+           zio->io_logical == zio && !(zio->io_flags & ZIO_FLAG_RAW)) {
                uint64_t csize = BP_GET_PSIZE(bp);
                void *cbuf = zio_buf_alloc(csize);
 
@@ -1790,7 +1791,30 @@ zio_vdev_io_start(zio_t *zio)
 
        ASSERT(P2PHASE(zio->io_offset, align) == 0);
        ASSERT(P2PHASE(zio->io_size, align) == 0);
-       ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
+       ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
+
+       /*
+        * If this is a repair I/O, and there's no self-healing involved --
+        * that is, we're just resilvering what we expect to resilver --
+        * then don't do the I/O unless zio's txg is actually in vd's DTL.
+        * This prevents spurious resilvering with nested replication.
+        * For example, given a mirror of mirrors, (A+B)+(C+D), if only
+        * A is out of date, we'll read from C+D, then use the data to
+        * resilver A+B -- but we don't actually want to resilver B, just A.
+        * The top-level mirror has no way to know this, so instead we just
+        * discard unnecessary repairs as we work our way down the vdev tree.
+        * The same logic applies to any form of nested replication:
+        * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
+        */
+       if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
+           !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
+           zio->io_txg != 0 && /* not a delegated i/o */
+           !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
+               ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+               ASSERT(zio->io_delegate_list == NULL);
+               zio_vdev_io_bypass(zio);
+               return (ZIO_PIPELINE_CONTINUE);
+       }
 
        if (vd->vdev_ops->vdev_op_leaf &&
            (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
@@ -1806,7 +1830,6 @@ zio_vdev_io_start(zio_t *zio)
                        zio_interrupt(zio);
                        return (ZIO_PIPELINE_STOP);
                }
-
        }
 
        return (vd->vdev_ops->vdev_op_io_start(zio));
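
Condensed into a predicate, the new bypass test reads as follows (a
hypothetical restatement with an invented helper name; the flags and the
vdev_dtl_contains() call are exactly those used above, with DTL_PARTIAL
asking whether any part of the txg range is missing on vd):

	/*
	 * Drop a repair write when it is plain resilver traffic (not
	 * self-healing), is not a delegated i/o (those have io_txg == 0),
	 * and vd's DTL shows no damage at that txg.
	 */
	static boolean_t
	zio_repair_is_unneeded(zio_t *zio, vdev_t *vd)
	{
		return ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
		    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
		    zio->io_txg != 0 &&
		    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1));
	}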
@@ -2157,6 +2180,7 @@ zio_done(zio_t *zio)
                if ((zio->io_type == ZIO_TYPE_READ ||
                    zio->io_type == ZIO_TYPE_FREE) &&
                    zio->io_error == ENXIO &&
+                   spa->spa_load_state == SPA_LOAD_NONE &&
                    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
                        zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 
index 4e99306..0206dad 100644 (file)
@@ -75,6 +75,7 @@
 #include <sys/vdev_impl.h>
 #include <sys/zvol.h>
 #include <sys/dumphdr.h>
+#include <sys/zil_impl.h>
 
 #include "zfs_namecheck.h"
 
@@ -113,7 +114,6 @@ typedef struct zvol_state {
        uint32_t        zv_total_opens; /* total open count */
        zilog_t         *zv_zilog;      /* ZIL handle */
        list_t          zv_extents;     /* List of extents for dump */
-       uint64_t        zv_txg_assign;  /* txg to assign during ZIL replay */
        znode_t         zv_znode;       /* for range locking */
 } zvol_state_t;
 
@@ -381,7 +381,7 @@ zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
 
        tx = dmu_tx_create(os);
        dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
-       error = dmu_tx_assign(tx, zv->zv_txg_assign);
+       error = dmu_tx_assign(tx, TXG_WAIT);
        if (error) {
                dmu_tx_abort(tx);
        } else {
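
Replay functions can now block in dmu_tx_assign(): under the old scheme they
ran nested inside the replay transaction in zil_replay_log_record() and had
to target its txg via zv_txg_assign, whereas now each is a free-standing tx.
A note on the semantics (under the DMU contract, TXG_WAIT retries internally
rather than returning ERESTART, so no goto-top loop is needed; hard errors
such as ENOSPC can still come back and are simply aborted, as above):

	error = dmu_tx_assign(tx, TXG_WAIT);	/* blocks; no ERESTART loop */
	if (error)
		dmu_tx_abort(tx);		/* hard error, e.g. ENOSPC */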
@@ -558,7 +558,7 @@ zvol_create_minor(const char *name, major_t maj)
        ASSERT(error == 0);
        zv->zv_volblocksize = doi.doi_data_block_size;
 
-       zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector, NULL);
+       zil_replay(os, zv, zvol_replay_vector);
        zvol_size_changed(zv, maj);
 
        /* XXX this should handle the possible i/o error */
@@ -971,8 +971,16 @@ static void
 zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
 {
        uint32_t blocksize = zv->zv_volblocksize;
+       zilog_t *zilog = zv->zv_zilog;
        lr_write_t *lr;
 
+       if (zilog->zl_replay) {
+               dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+               zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
+                   zilog->zl_replaying_seq;
+               return;
+       }
+
        while (len) {
                ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
                itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr));
@@ -987,7 +995,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
                lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t);
                BP_ZERO(&lr->lr_blkptr);
 
-               (void) zil_itx_assign(zv->zv_zilog, itx, tx);
+               (void) zil_itx_assign(zilog, itx, tx);
                len -= nbytes;
                off += nbytes;
        }
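
Putting the zvol pieces together, replay progress now flows entirely through
the zilog rather than through a caller-supplied txg pointer; a sequence
sketch (all names are from the hunks above):

	/*
	 *   zvol_create_minor()
	 *     zil_replay()               sets zl_replay, walks the log
	 *       zil_replay_log_record()  records lr_seq in zl_replaying_seq
	 *         zvol_replay_write()    assigns its own tx with TXG_WAIT
	 *           zvol_log_write()     copies zl_replaying_seq into
	 *                                zl_replayed_seq[txg & TXG_MASK]
	 *   zil_sync()                   zh_replay_seq = zl_replayed_seq[...]
	 */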