X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fvdev.c;h=bac3e86054d6f8e64f0fed061725a621236c8535;hb=572e285762521df27fe5b026f409ba1a21abb7ac;hp=bb5024f985f566e7ab12f1dba70ab1655bc16d26;hpb=45d1cae3b8c949ecc391dd7a5b81963b34c71c29;p=zfs.git diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index bb5024f..bac3e86 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -40,6 +39,7 @@ #include #include #include +#include /* * Virtual device management. @@ -54,6 +54,7 @@ static vdev_ops_t *vdev_ops_table[] = { &vdev_disk_ops, &vdev_file_ops, &vdev_missing_ops, + &vdev_hole_ops, NULL }; @@ -206,9 +207,6 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd) */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; - - if (cvd->vdev_ops->vdev_op_leaf) - cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit; } void @@ -243,9 +241,6 @@ vdev_remove_child(vdev_t *pvd, vdev_t *cvd) */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; - - if (cvd->vdev_ops->vdev_op_leaf) - cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit; } /* @@ -281,7 +276,7 @@ vdev_compact_children(vdev_t *pvd) /* * Allocate and minimally initialize a vdev_t. */ -static vdev_t * +vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; @@ -293,21 +288,18 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) spa->spa_root_vdev = vd; } - if (guid == 0) { + if (guid == 0 && ops != &vdev_hole_ops) { if (spa->spa_root_vdev == vd) { /* * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ - while (guid == 0 || spa_guid_exists(guid, 0)) - guid = spa_get_random(-1ULL); + guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. */ - while (guid == 0 || - spa_guid_exists(spa_guid(spa), guid)) - guid = spa_get_random(-1ULL); + guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } @@ -318,6 +310,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) vd->vdev_guid_sum = guid; vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; + vd->vdev_ishole = (ops == &vdev_hole_ops); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); @@ -397,6 +390,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (islog && spa_version(spa) < SPA_VERSION_SLOGS) return (ENOTSUP); + if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) + return (ENOTSUP); + /* * Set the nparity property for RAID-Z vdevs. */ @@ -404,10 +400,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (ops == &vdev_raidz_ops) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { - /* - * Currently, we can only support 3 parity devices. - */ - if (nparity == 0 || nparity > 3) + if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) return (EINVAL); /* * Previous versions could only support 1 or 2 parity @@ -472,15 +465,33 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); /* + * Retrieve the vdev creation time. 
+ */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, + &vd->vdev_crtxg); + + /* * If we're a top-level vdev, try to load the allocation parameters. */ - if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) { + if (parent && !parent->vdev_parent && + (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, + &vd->vdev_removing); + } + + if (parent && !parent->vdev_parent) { + ASSERT(alloctype == VDEV_ALLOC_LOAD || + alloctype == VDEV_ALLOC_ADD || + alloctype == VDEV_ALLOC_SPLIT || + alloctype == VDEV_ALLOC_ROOTPOOL); + vd->vdev_mg = metaslab_group_create(islog ? + spa_log_class(spa) : spa_normal_class(spa), vd); } /* @@ -507,18 +518,33 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING, + &vd->vdev_resilvering); + /* * When importing a pool, we want to ignore the persistent fault * state, as the diagnosis made on another system may not be - * valid in the current context. + * valid in the current context. Local vdevs will + * remain in the faulted state. */ - if (spa->spa_load_state == SPA_LOAD_OPEN) { + if (spa_load_state(spa) == SPA_LOAD_OPEN) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &vd->vdev_faulted); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &vd->vdev_degraded); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &vd->vdev_removed); + + if (vd->vdev_faulted || vd->vdev_degraded) { + char *aux; + + vd->vdev_label_aux = + VDEV_AUX_ERR_EXCEEDED; + if (nvlist_lookup_string(nv, + ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && + strcmp(aux, "external") == 0) + vd->vdev_label_aux = VDEV_AUX_EXTERNAL; + } } } @@ -544,6 +570,7 @@ vdev_free(vdev_t *vd) vdev_close(vd); ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); + ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); /* * Free all children. @@ -557,8 +584,10 @@ vdev_free(vdev_t *vd) /* * Discard allocation state. */ - if (vd == vd->vdev_top) + if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); + metaslab_group_destroy(vd->vdev_mg); + } ASSERT3U(vd->vdev_stat.vs_space, ==, 0); ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0); @@ -705,6 +734,7 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_state = cvd->vdev_state; + mvd->vdev_crtxg = cvd->vdev_crtxg; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); @@ -746,6 +776,7 @@ vdev_remove_parent(vdev_t *cvd) */ if (mvd->vdev_top == mvd) { uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; + cvd->vdev_orig_guid = cvd->vdev_guid; cvd->vdev_guid += guid_delta; cvd->vdev_guid_sum += guid_delta; } @@ -765,16 +796,22 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; - metaslab_class_t *mc; uint64_t m; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; - if (vd->vdev_ms_shift == 0) /* not being allocated from yet */ + ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); + + /* + * This vdev is not being allocated from yet or is a hole. 
+ */ + if (vd->vdev_ms_shift == 0) return (0); + ASSERT(!vd->vdev_ishole); + /* * Compute the raidz-deflation ratio. Note, we hard-code * in 128k (1 << 17) because it is the current "typical" blocksize. @@ -786,14 +823,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) ASSERT(oldc <= newc); - if (vd->vdev_islog) - mc = spa->spa_log_class; - else - mc = spa->spa_normal_class; - - if (vd->vdev_mg == NULL) - vd->vdev_mg = metaslab_group_create(mc, vd); - mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (oldc != 0) { @@ -828,6 +857,20 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg); } + if (txg == 0) + spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); + + /* + * If the vdev is being removed we don't activate + * the metaslabs since we want to ensure that no new + * allocations are performed on this device. + */ + if (oldc == 0 && !vd->vdev_removing) + metaslab_group_activate(vd->vdev_mg); + + if (txg == 0) + spa_config_exit(spa, SCL_ALLOC, FTAG); + return (0); } @@ -838,6 +881,7 @@ vdev_metaslab_fini(vdev_t *vd) uint64_t count = vd->vdev_ms_count; if (vd->vdev_ms != NULL) { + metaslab_group_passivate(vd->vdev_mg); for (m = 0; m < count; m++) if (vd->vdev_ms[m] != NULL) metaslab_fini(vd->vdev_ms[m]); @@ -965,6 +1009,10 @@ vdev_probe(vdev_t *vd, zio_t *zio) vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); + /* + * We can't change the vdev state in this context, so we + * kick off an async task to do it on our behalf. + */ if (zio != NULL) { vd->vdev_probe_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_PROBE); @@ -1007,12 +1055,35 @@ vdev_open_child(void *arg) vd->vdev_open_thread = NULL; } +boolean_t +vdev_uses_zvols(vdev_t *vd) +{ + if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, + strlen(ZVOL_DIR)) == 0) + return (B_TRUE); + for (int c = 0; c < vd->vdev_children; c++) + if (vdev_uses_zvols(vd->vdev_child[c])) + return (B_TRUE); + return (B_FALSE); +} + void vdev_open_children(vdev_t *vd) { taskq_t *tq; int children = vd->vdev_children; + /* + * in order to handle pools on top of zvols, do the opens + * in a single thread so that the same thread holds the + * spa_namespace_lock + */ + if (vdev_uses_zvols(vd)) { + for (int c = 0; c < children; c++) + vd->vdev_child[c]->vdev_open_error = + vdev_open(vd->vdev_child[c]); + return; + } tq = taskq_create("vdev_open", children, minclsyspri, children, children, TASKQ_PREPOPULATE); @@ -1046,10 +1117,16 @@ vdev_open(vdev_t *vd) vd->vdev_cant_write = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); + /* + * If this vdev is not removed, check its fault status. If it's + * faulted, bail out of the open. + */ if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); + ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || + vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, - VDEV_AUX_ERR_EXCEEDED); + vd->vdev_label_aux); return (ENXIO); } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); @@ -1059,6 +1136,11 @@ vdev_open(vdev_t *vd) error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift); + /* + * Reset the vdev_reopening flag so that we actually close + * the vdev on error. + */ + vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, NULL, ENXIO); @@ -1074,14 +1156,33 @@ vdev_open(vdev_t *vd) vd->vdev_removed = B_FALSE; + /* + * Recheck the faulted flag now that we have confirmed that + * the vdev is accessible. If we're faulted, bail. 
+ */ + if (vd->vdev_faulted) { + ASSERT(vd->vdev_children == 0); + ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || + vd->vdev_label_aux == VDEV_AUX_EXTERNAL); + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + vd->vdev_label_aux); + return (ENXIO); + } + if (vd->vdev_degraded) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { - vd->vdev_state = VDEV_STATE_HEALTHY; + vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } + /* + * For hole or missing vdevs we just return success. + */ + if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) + return (0); + for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, @@ -1157,8 +1258,8 @@ vdev_open(vdev_t *vd) */ if (vd->vdev_ops->vdev_op_leaf && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_IO_FAILURE); + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); return (error); } @@ -1189,7 +1290,7 @@ vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; nvlist_t *label; - uint64_t guid, top_guid; + uint64_t guid = 0, top_guid; uint64_t state; for (int c = 0; c < vd->vdev_children; c++) @@ -1202,6 +1303,8 @@ vdev_validate(vdev_t *vd) * overwrite the previous state. */ if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + uint64_t aux_guid = 0; + nvlist_t *nvl; if ((label = vdev_label_read_config(vd)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, @@ -1209,6 +1312,18 @@ vdev_validate(vdev_t *vd) return (0); } + /* + * Determine if this vdev has been split off into another + * pool. If so, then refuse to open it. + */ + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, + &aux_guid) == 0 && aux_guid == spa_guid(spa)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_SPLIT_POOL); + nvlist_free(label); + return (0); + } + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, @@ -1217,6 +1332,11 @@ vdev_validate(vdev_t *vd) return (0); } + if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) + != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, + &aux_guid) != 0) + aux_guid = 0; + /* * If this vdev just became a top-level vdev because its * sibling was detached, it will have adopted the parent's @@ -1224,12 +1344,16 @@ vdev_validate(vdev_t *vd) * Fortunately, either version of the label will have the * same top guid, so if we're a top-level vdev, we can * safely compare to that instead. + * + * If we split this vdev off instead, then we also check the + * original pool's guid. We don't want to consider the vdev + * corrupt if it is partway through a split operation. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0 || - (vd->vdev_guid != guid && + ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) && (vd->vdev_guid != top_guid || vd != vd->vdev_top))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); @@ -1248,11 +1372,11 @@ vdev_validate(vdev_t *vd) nvlist_free(label); /* - * If spa->spa_load_verbatim is true, no need to check the + * If this is a verbatim import, no need to check the * state of the pool. 
*/ - if (!spa->spa_load_verbatim && - spa->spa_load_state == SPA_LOAD_OPEN && + if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && + spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) return (EBADF); @@ -1275,9 +1399,17 @@ void vdev_close(vdev_t *vd) { spa_t *spa = vd->vdev_spa; + vdev_t *pvd = vd->vdev_parent; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + /* + * If our parent is reopening, then we are as well, unless we are + * going offline. + */ + if (pvd != NULL && pvd->vdev_reopening) + vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); + vd->vdev_ops->vdev_op_close(vd); vdev_cache_purge(vd); @@ -1297,12 +1429,49 @@ vdev_close(vdev_t *vd) } void +vdev_hold(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_is_root(spa)); + if (spa->spa_state == POOL_STATE_UNINITIALIZED) + return; + + for (int c = 0; c < vd->vdev_children; c++) + vdev_hold(vd->vdev_child[c]); + + if (vd->vdev_ops->vdev_op_leaf) + vd->vdev_ops->vdev_op_hold(vd); +} + +void +vdev_rele(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_is_root(spa)); + for (int c = 0; c < vd->vdev_children; c++) + vdev_rele(vd->vdev_child[c]); + + if (vd->vdev_ops->vdev_op_leaf) + vd->vdev_ops->vdev_op_rele(vd); +} + +/* + * Reopen all interior vdevs and any unopened leaves. We don't actually + * reopen leaf vdevs which had previously been opened as they might deadlock + * on the spa_config_lock. Instead we only obtain the leaf's physical size. + * If the leaf has never been opened then open it, as usual. + */ +void vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + /* set the reopening flag unless we're taking the vdev offline */ + vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); @@ -1370,7 +1539,9 @@ void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); + ASSERT(!vd->vdev_ishole); ASSERT(ISP2(flags)); + ASSERT(spa_writeable(vd->vdev_spa)); if (flags & VDD_METASLAB) (void) txg_list_add(&vd->vdev_ms_list, arg, txg); @@ -1385,7 +1556,7 @@ vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) * DTLs. * * A vdev's DTL (dirty time log) is the set of transaction groups for which - * the vdev has less than perfect replication. There are three kinds of DTL: + * the vdev has less than perfect replication. There are four kinds of DTL: * * DTL_MISSING: txgs for which the vdev has no valid copies of the data * @@ -1426,6 +1597,7 @@ vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); + ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(sm->sm_lock); if (!space_map_contains(sm, txg, size)) @@ -1479,14 +1651,16 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) vdev_dtl_reassess(vd->vdev_child[c], txg, scrub_txg, scrub_done); - if (vd == spa->spa_root_vdev) + if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + mutex_enter(&vd->vdev_dtl_lock); if (scrub_txg != 0 && - (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) { - /* XXX should check scrub_done? */ + (spa->spa_scrub_started || + (scn && scn->scn_phys.scn_errors == 0))) { /* * We completed a scrub up to scrub_txg. 
If we * did it without rebooting, then the scrub dtl @@ -1534,6 +1708,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) mutex_enter(&vd->vdev_dtl_lock); for (int t = 0; t < DTL_TYPES; t++) { + /* account for child's outage in parent's missing map */ + int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; if (t == DTL_SCRUB) continue; /* leaf vdevs only */ if (t == DTL_PARTIAL) @@ -1546,7 +1722,7 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_dtl_lock); - space_map_ref_add_map(&reftree, &cvd->vdev_dtl[t], 1); + space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1); mutex_exit(&cvd->vdev_dtl_lock); } space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref); @@ -1569,6 +1745,8 @@ vdev_dtl_load(vdev_t *vd) if (smo->smo_object == 0) return (0); + ASSERT(!vd->vdev_ishole); + if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0) return (error); @@ -1596,6 +1774,8 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) dmu_buf_t *db; dmu_tx_t *tx; + ASSERT(!vd->vdev_ishole); + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached) { @@ -1674,6 +1854,9 @@ vdev_dtl_required(vdev_t *vd) vd->vdev_cant_read = cant_read; vdev_dtl_reassess(tvd, 0, 0, B_FALSE); + if (!required && zio_injection_enabled) + required = !!zio_handle_device_injection(vd, NULL, ECHILD); + return (required); } @@ -1732,7 +1915,7 @@ vdev_load(vdev_t *vd) /* * If this is a top-level vdev, initialize its metaslabs. */ - if (vd == vd->vdev_top && + if (vd == vd->vdev_top && !vd->vdev_ishole && (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || vdev_metaslab_init(vd, 0) != 0)) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, @@ -1789,12 +1972,54 @@ vdev_validate_aux(vdev_t *vd) } void +vdev_remove(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + dmu_tx_t *tx; + + tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); + + if (vd->vdev_dtl_smo.smo_object) { + ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0); + (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx); + vd->vdev_dtl_smo.smo_object = 0; + } + + if (vd->vdev_ms != NULL) { + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + if (msp == NULL || msp->ms_smo.smo_object == 0) + continue; + + ASSERT3U(msp->ms_smo.smo_alloc, ==, 0); + (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx); + msp->ms_smo.smo_object = 0; + } + } + + if (vd->vdev_ms_array) { + (void) dmu_object_free(mos, vd->vdev_ms_array, tx); + vd->vdev_ms_array = 0; + vd->vdev_ms_shift = 0; + } + dmu_tx_commit(tx); +} + +void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; + boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); + + ASSERT(!vd->vdev_ishole); while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) metaslab_sync_done(msp, txg); + + if (reassess) + metaslab_sync_reassess(vd->vdev_mg); } void @@ -1805,6 +2030,8 @@ vdev_sync(vdev_t *vd, uint64_t txg) metaslab_t *msp; dmu_tx_t *tx; + ASSERT(!vd->vdev_ishole); + if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { ASSERT(vd == vd->vdev_top); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); @@ -1815,6 +2042,12 @@ vdev_sync(vdev_t *vd, uint64_t txg) dmu_tx_commit(tx); } + /* + * Remove the metadata associated with this vdev once it's empty. 
+ */ + if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) + vdev_remove(vd, txg); + while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); @@ -1837,11 +2070,11 @@ vdev_psize_to_asize(vdev_t *vd, uint64_t psize) * not be opened, and no I/O is attempted. */ int -vdev_fault(spa_t *spa, uint64_t guid) +vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { - vdev_t *vd; + vdev_t *vd, *tvd; - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -1849,19 +2082,28 @@ vdev_fault(spa_t *spa, uint64_t guid) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; + + /* + * We don't directly use the aux state here, but if we do a + * vdev_reopen(), we need this value to be present to remember why we + * were faulted. + */ + vd->vdev_label_aux = aux; + /* * Faulted state takes precedence over degraded. */ + vd->vdev_delayed_close = B_FALSE; vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; - vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); + vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* - * If marking the vdev as faulted cause the top-level vdev to become - * unavailable, then back off and simply mark the vdev as degraded - * instead. + * If this device has the only valid copy of the data, then + * back off and simply mark the vdev as degraded instead. */ - if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) { + if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; @@ -1869,12 +2111,10 @@ vdev_fault(spa_t *spa, uint64_t guid) * If we reopen the device and it's not dead, only then do we * mark it degraded. */ - vdev_reopen(vd); + vdev_reopen(tvd); - if (vdev_readable(vd)) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, - VDEV_AUX_ERR_EXCEEDED); - } + if (vdev_readable(vd)) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); } return (spa_vdev_state_exit(spa, vd, 0)); @@ -1886,11 +2126,11 @@ vdev_fault(spa_t *spa, uint64_t guid) * as I/O is concerned. 
*/ int -vdev_degrade(spa_t *spa, uint64_t guid) +vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -1907,7 +2147,7 @@ vdev_degrade(spa_t *spa, uint64_t guid) vd->vdev_degraded = 1ULL; if (!vdev_is_dead(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, - VDEV_AUX_ERR_EXCEEDED); + aux); return (spa_vdev_state_exit(spa, vd, 0)); } @@ -1923,7 +2163,7 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -1969,13 +2209,16 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) return (spa_vdev_state_exit(spa, vd, 0)); } -int -vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) +static int +vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *vd, *tvd; - int error; + int error = 0; + uint64_t generation; + metaslab_group_t *mg; - spa_vdev_state_enter(spa); +top: + spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -1984,6 +2227,8 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; + mg = tvd->vdev_mg; + generation = spa->spa_config_generation + 1; /* * If the device isn't already offline, try to offline it. @@ -1999,6 +2244,37 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) return (spa_vdev_state_exit(spa, NULL, EBUSY)); /* + * If the top-level is a slog and it has had allocations + * then proceed. We check that the vdev's metaslab group + * is not NULL since it's possible that we may have just + * added this vdev but not yet initialized its metaslabs. + */ + if (tvd->vdev_islog && mg != NULL) { + /* + * Prevent any future allocations. + */ + metaslab_group_passivate(mg); + (void) spa_vdev_state_exit(spa, vd, 0); + + error = spa_offline_log(spa); + + spa_vdev_state_enter(spa, SCL_ALLOC); + + /* + * Check to see if the config has changed. + */ + if (error || generation != spa->spa_config_generation) { + metaslab_group_activate(mg); + if (error) + return (spa_vdev_state_exit(spa, + vd, error)); + (void) spa_vdev_state_exit(spa, vd, 0); + goto top; + } + ASSERT3U(tvd->vdev_stat.vs_alloc, ==, 0); + } + + /* * Offline this device and reopen its top-level vdev. * If the top-level vdev is a log device then just offline * it. Otherwise, if this action results in the top-level @@ -2013,28 +2289,30 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, EBUSY)); } + + /* + * Add the device back into the metaslab rotor so that + * once we online the device it's open for business. 
+ */ + if (tvd->vdev_islog && mg != NULL) + metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); - if (!tvd->vdev_islog || !vdev_is_dead(tvd)) - return (spa_vdev_state_exit(spa, vd, 0)); + return (spa_vdev_state_exit(spa, vd, 0)); +} - (void) spa_vdev_state_exit(spa, vd, 0); +int +vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) +{ + int error; - error = dmu_objset_find(spa_name(spa), zil_vdev_offline, - NULL, DS_FIND_CHILDREN); - if (error) { - (void) vdev_online(spa, guid, 0, NULL); - return (error); - } - /* - * If we successfully offlined the log device then we need to - * sync out the current txg so that the "stubby" block can be - * removed by zil_sync(). - */ - txg_wait_synced(spa->spa_dsl_pool, 0); - return (0); + mutex_enter(&spa->spa_vdev_top_lock); + error = vdev_offline_locked(spa, guid, flags); + mutex_exit(&spa->spa_vdev_top_lock); + + return (error); } /* @@ -2068,13 +2346,22 @@ vdev_clear(spa_t *spa, vdev_t *vd) if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { - vd->vdev_faulted = vd->vdev_degraded = 0; + /* + * When reopening in reponse to a clear event, it may be due to + * a fmadm repair request. In this case, if the device is + * still broken, we want to still post the ereport again. + */ + vd->vdev_forcefault = B_TRUE; + + vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; - vdev_reopen(vd); + vdev_reopen(vd == rvd ? rvd : vd->vdev_top); - if (vd != rvd) + vd->vdev_forcefault = B_FALSE; + + if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) @@ -2082,12 +2369,30 @@ vdev_clear(spa_t *spa, vdev_t *vd) spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); } + + /* + * When clearing a FMA-diagnosed fault, we always want to + * unspare the device, as we assume that the original spare was + * done in response to the FMA fault. + */ + if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && + vd->vdev_parent->vdev_ops == &vdev_spare_ops && + vd->vdev_parent->vdev_child[0] == vd) + vd->vdev_unspare = B_TRUE; } boolean_t vdev_is_dead(vdev_t *vd) { - return (vd->vdev_state < VDEV_STATE_DEGRADED); + /* + * Holes and missing devices are always considered "dead". + * This simplifies the code since we don't have to check for + * these types of devices in the various code paths. + * Instead we rely on the fact that we skip over dead devices + * before issuing I/O to them. + */ + return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole || + vd->vdev_ops == &vdev_missing_ops); } boolean_t @@ -2116,7 +2421,7 @@ vdev_allocatable(vdev_t *vd) * we're asking two separate questions about it. 
*/ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && - !vd->vdev_cant_write); + !vd->vdev_cant_write && !vd->vdev_ishole); } boolean_t @@ -2146,7 +2451,6 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) mutex_enter(&vd->vdev_stat_lock); bcopy(&vd->vdev_stat, vs, sizeof (*vs)); - vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors; vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); @@ -2168,7 +2472,7 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } - vs->vs_scrub_examined += cvs->vs_scrub_examined; + cvs->vs_scan_removing = cvd->vdev_removing; mutex_exit(&vd->vdev_stat_lock); } } @@ -2185,6 +2489,19 @@ vdev_clear_stats(vdev_t *vd) } void +vdev_scan_stat_init(vdev_t *vd) +{ + vdev_stat_t *vs = &vd->vdev_stat; + + for (int c = 0; c < vd->vdev_children; c++) + vdev_scan_stat_init(vd->vdev_child[c]); + + mutex_enter(&vd->vdev_stat_lock); + vs->vs_scan_processed = 0; + mutex_exit(&vd->vdev_stat_lock); +} + +void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; @@ -2228,8 +2545,17 @@ vdev_stat_update(zio_t *zio, uint64_t psize) mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { - if (flags & ZIO_FLAG_SCRUB_THREAD) - vs->vs_scrub_repaired += psize; + if (flags & ZIO_FLAG_SCAN_THREAD) { + dsl_scan_phys_t *scn_phys = + &spa->spa_dsl_pool->dp_scan->scn_phys; + uint64_t *processed = &scn_phys->scn_processed; + + /* XXX cleanup? */ + if (vd->vdev_ops->vdev_op_leaf) + atomic_add_64(processed, psize); + vs->vs_scan_processed += psize; + } + if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } @@ -2254,6 +2580,14 @@ vdev_stat_update(zio_t *zio, uint64_t psize) !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; + /* + * Intent logs writes won't propagate their error to the root + * I/O so don't mark these types of failures as pool-level + * errors. + */ + if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) + return; + mutex_enter(&vd->vdev_stat_lock); if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { if (zio->io_error == ECKSUM) @@ -2267,14 +2601,17 @@ vdev_stat_update(zio_t *zio, uint64_t psize) if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || - (flags & ZIO_FLAG_SCRUB_THREAD))) { + (flags & ZIO_FLAG_SCAN_THREAD) || + spa->spa_claiming)) { /* - * This is either a normal write (not a repair), or it's a - * repair induced by the scrub thread. In the normal case, - * we commit the DTL change in the same txg as the block - * was born. In the scrub-induced repair case, we know that - * scrubs run in first-pass syncing context, so we commit - * the DTL change in spa->spa_syncing_txg. + * This is either a normal write (not a repair), or it's + * a repair induced by the scrub thread, or it's a repair + * made by zil_claim() during spa_load() in the first txg. + * In the normal case, we commit the DTL change in the same + * txg as the block was born. In the scrub-induced repair + * case, we know that scrubs run in first-pass syncing context, + * so we commit the DTL change in spa_syncing_txg(spa). + * In the zil_claim() case, we commit in spa_first_txg(spa). 
* * We currently do not make DTL entries for failed spontaneous * self-healing writes triggered by normal (non-scrubbing) @@ -2283,13 +2620,16 @@ vdev_stat_update(zio_t *zio, uint64_t psize) */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; - if (flags & ZIO_FLAG_SCRUB_THREAD) { + if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); - commit_txg = spa->spa_syncing_txg; + commit_txg = spa_syncing_txg(spa); + } else if (spa->spa_claiming) { + ASSERT(flags & ZIO_FLAG_IO_REPAIR); + commit_txg = spa_first_txg(spa); } - ASSERT(commit_txg >= spa->spa_syncing_txg); + ASSERT(commit_txg >= spa_syncing_txg(spa)); if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) @@ -2301,45 +2641,19 @@ vdev_stat_update(zio_t *zio, uint64_t psize) } } -void -vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) -{ - vdev_stat_t *vs = &vd->vdev_stat; - - for (int c = 0; c < vd->vdev_children; c++) - vdev_scrub_stat_update(vd->vdev_child[c], type, complete); - - mutex_enter(&vd->vdev_stat_lock); - - if (type == POOL_SCRUB_NONE) { - /* - * Update completion and end time. Leave everything else alone - * so we can report what happened during the previous scrub. - */ - vs->vs_scrub_complete = complete; - vs->vs_scrub_end = gethrestime_sec(); - } else { - vs->vs_scrub_type = type; - vs->vs_scrub_complete = 0; - vs->vs_scrub_examined = 0; - vs->vs_scrub_repaired = 0; - vs->vs_scrub_start = gethrestime_sec(); - vs->vs_scrub_end = 0; - } - - mutex_exit(&vd->vdev_stat_lock); -} - /* - * Update the in-core space usage stats for this vdev and the root vdev. + * Update the in-core space usage stats for this vdev, its metaslab class, + * and the root vdev. */ void -vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta, - boolean_t update_root) +vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, + int64_t space_delta) { int64_t dspace_delta = space_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; + metaslab_group_t *mg = vd->vdev_mg; + metaslab_class_t *mc = mg ? mg->mg_class : NULL; ASSERT(vd == vd->vdev_top); @@ -2355,28 +2669,26 @@ vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta, vd->vdev_deflate_ratio; mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_alloc += alloc_delta; + vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); - if (update_root) { - ASSERT(rvd == vd->vdev_parent); - ASSERT(vd->vdev_ms_count != 0); - - /* - * Don't count non-normal (e.g. intent log) space as part of - * the pool's capacity. 
- */ - if (vd->vdev_mg->mg_class != spa->spa_normal_class) - return; - + if (mc == spa_normal_class(spa)) { mutex_enter(&rvd->vdev_stat_lock); - rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_alloc += alloc_delta; + rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } + + if (mc != NULL) { + ASSERT(rvd == vd->vdev_parent); + ASSERT(vd->vdev_ms_count != 0); + + metaslab_class_space_update(mc, + alloc_delta, defer_delta, space_delta, dspace_delta); + } } /* @@ -2391,6 +2703,8 @@ vdev_config_dirty(vdev_t *vd) vdev_t *rvd = spa->spa_root_vdev; int c; + ASSERT(spa_writeable(spa)); + /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. @@ -2428,7 +2742,7 @@ vdev_config_dirty(vdev_t *vd) * sketchy, but it will work. */ nvlist_free(aux[c]); - aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE); + aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } @@ -2449,7 +2763,8 @@ vdev_config_dirty(vdev_t *vd) } else { ASSERT(vd == vd->vdev_top); - if (!list_link_active(&vd->vdev_config_dirty_node)) + if (!list_link_active(&vd->vdev_config_dirty_node) && + !vd->vdev_ishole) list_insert_head(&spa->spa_config_dirty_list, vd); } } @@ -2478,6 +2793,7 @@ vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; + ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* @@ -2490,7 +2806,7 @@ vdev_state_dirty(vdev_t *vd) (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); - if (!list_link_active(&vd->vdev_state_dirty_node)) + if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole) list_insert_head(&spa->spa_state_dirty_list, vd); } @@ -2523,6 +2839,12 @@ vdev_propagate_state(vdev_t *vd) for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; + /* + * Don't factor holes into the decision. + */ + if (child->vdev_ishole) + continue; + if (!vdev_readable(child) || (!vdev_writeable(child) && spa_writeable(spa))) { /* @@ -2586,15 +2908,31 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) /* * If we are setting the vdev state to anything but an open state, then - * always close the underlying device. Otherwise, we keep accessible - * but invalid devices open forever. We don't call vdev_close() itself, - * because that implies some extra checks (offline, etc) that we don't - * want here. This is limited to leaf devices, because otherwise - * closing the device will affect other children. + * always close the underlying device unless the device has requested + * a delayed close (i.e. we're about to remove or fault the device). + * Otherwise, we keep accessible but invalid devices open forever. + * We don't call vdev_close() itself, because that implies some extra + * checks (offline, etc) that we don't want here. This is limited to + * leaf devices, because otherwise closing the device will affect other + * children. */ - if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) + if (!vd->vdev_delayed_close && vdev_is_dead(vd) && + vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); + /* + * If we have brought this vdev back into service, we need + * to notify fmd so that it can gracefully repair any outstanding + * cases due to a missing device. We do this in all cases, even those + * that probably don't correlate to a repaired fault. This is sure to + * catch all cases, and we let the zfs-retire agent sort it out. 
If + * this is a transient state it's OK, as the retire agent will + * double-check the state of the vdev before repairing it. + */ + if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf && + vd->vdev_prevstate != state) + zfs_post_state_change(spa, vd); + if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { @@ -2610,20 +2948,16 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) vd->vdev_state = VDEV_STATE_REMOVED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } else if (state == VDEV_STATE_REMOVED) { - /* - * Indicate to the ZFS DE that this device has been removed, and - * any recent errors should be ignored. - */ - zfs_post_remove(spa, vd); vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* - * If we fail to open a vdev during an import, we mark it as - * "not available", which signifies that it was never there to - * begin with. Failure to open such a device is not considered - * an error. + * If we fail to open a vdev during an import or recovery, we + * mark it as "not available", which signifies that it was + * never there to begin with. Failure to open such a device + * is not considered an error. */ - if (spa->spa_load_state == SPA_LOAD_IMPORT && + if ((spa_load_state(spa) == SPA_LOAD_IMPORT || + spa_load_state(spa) == SPA_LOAD_RECOVER) && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; @@ -2666,9 +3000,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; - case VDEV_AUX_IO_FAILURE: - class = FM_EREPORT_ZFS_IO_FAILURE; - break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } @@ -2716,36 +3047,55 @@ vdev_is_bootable(vdev_t *vd) return (B_TRUE); } +/* + * Load the state from the original vdev tree (ovd) which + * we've retrieved from the MOS config object. If the original + * vdev was offline or faulted then we transfer that state to the + * device in the current vdev tree (nvd). + */ void -vdev_load_log_state(vdev_t *vd, nvlist_t *nv) +vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) { - uint_t children; - nvlist_t **child; - uint64_t val; - spa_t *spa = vd->vdev_spa; + spa_t *spa = nvd->vdev_spa; - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0) { - for (int c = 0; c < children; c++) - vdev_load_log_state(vd->vdev_child[c], child[c]); - } + ASSERT(nvd->vdev_top->vdev_islog); + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); - if (vd->vdev_ops->vdev_op_leaf && nvlist_lookup_uint64(nv, - ZPOOL_CONFIG_OFFLINE, &val) == 0 && val) { + for (int c = 0; c < nvd->vdev_children; c++) + vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); + if (nvd->vdev_ops->vdev_op_leaf) { /* - * It would be nice to call vdev_offline() - * directly but the pool isn't fully loaded and - * the txg threads have not been started yet. + * Restore the persistent vdev state */ - spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_WRITER); - vd->vdev_offline = val; - vdev_reopen(vd->vdev_top); - spa_config_exit(spa, SCL_STATE_ALL, FTAG); + nvd->vdev_offline = ovd->vdev_offline; + nvd->vdev_faulted = ovd->vdev_faulted; + nvd->vdev_degraded = ovd->vdev_degraded; + nvd->vdev_removed = ovd->vdev_removed; } } /* + * Determine if a log device has valid content. 
If the vdev was + * removed or faulted in the MOS config then we know that + * the content on the log device has already been written to the pool. + */ +boolean_t +vdev_log_state_valid(vdev_t *vd) +{ + if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && + !vd->vdev_removed) + return (B_TRUE); + + for (int c = 0; c < vd->vdev_children; c++) + if (vdev_log_state_valid(vd->vdev_child[c])) + return (B_TRUE); + + return (B_FALSE); +} + +/* * Expand a vdev if possible. */ void @@ -2759,3 +3109,22 @@ vdev_expand(vdev_t *vd, uint64_t txg) vdev_config_dirty(vd); } } + +/* + * Split a vdev. + */ +void +vdev_split(vdev_t *vd) +{ + vdev_t *cvd, *pvd = vd->vdev_parent; + + vdev_remove_child(pvd, vd); + vdev_compact_children(pvd); + + cvd = pvd->vdev_child[0]; + if (pvd->vdev_children == 1) { + vdev_remove_parent(cvd); + cvd->vdev_splitting = B_TRUE; + } + vdev_propagate_state(cvd); +}
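
Note on the retry pattern introduced above: the new vdev_offline_locked() passivates the log vdev's metaslab group, exits the vdev state lock while spa_offline_log() runs, re-enters with SCL_ALLOC, and jumps back to the top: label to retry if spa_config_generation no longer matches the value captured before the lock was dropped. The standalone userland sketch below models only that "drop lock, do slow work, re-check generation, retry" shape; it is not the kernel code, and the names cfg_lock, cfg_generation, do_slow_work(), and offline_with_retry() are illustrative stand-ins.

/*
 * Minimal userland model of the generation-check retry loop used by
 * vdev_offline_locked().  All names here are hypothetical; do_slow_work()
 * stands in for spa_offline_log().
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cfg_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long cfg_generation;	/* bumped whenever the config changes */

static int
do_slow_work(void)
{
	/* Slow operation that must not run under cfg_lock; may fail. */
	return (0);
}

static int
offline_with_retry(void)
{
	unsigned long gen;
	int error;

top:
	pthread_mutex_lock(&cfg_lock);
	gen = cfg_generation;

	/* Drop the lock for the slow part, as the kernel code does. */
	pthread_mutex_unlock(&cfg_lock);
	error = do_slow_work();
	pthread_mutex_lock(&cfg_lock);

	if (error != 0 || gen != cfg_generation) {
		/* The work failed, or the config moved underneath us. */
		pthread_mutex_unlock(&cfg_lock);
		if (error != 0)
			return (error);
		goto top;	/* config changed: undo nothing, just retry */
	}

	pthread_mutex_unlock(&cfg_lock);
	return (0);
}

int
main(void)
{
	printf("offline_with_retry() -> %d\n", offline_with_retry());
	return (0);
}

The design point the sketch captures is that the state lock is never held across the slow call; correctness is instead recovered after the fact by comparing the configuration generation and restarting the whole operation if anything changed in the window.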