X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fvdev.c;h=bac3e86054d6f8e64f0fed061725a621236c8535;hb=572e285762521df27fe5b026f409ba1a21abb7ac;hp=bb5024f985f566e7ab12f1dba70ab1655bc16d26;hpb=45d1cae3b8c949ecc391dd7a5b81963b34c71c29;p=zfs.git diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index bb5024f..bac3e86 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -40,6 +39,7 @@ #include #include #include +#include /* * Virtual device management. @@ -54,6 +54,7 @@ static vdev_ops_t *vdev_ops_table[] = { &vdev_disk_ops, &vdev_file_ops, &vdev_missing_ops, + &vdev_hole_ops, NULL }; @@ -206,9 +207,6 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd) */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; - - if (cvd->vdev_ops->vdev_op_leaf) - cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit; } void @@ -243,9 +241,6 @@ vdev_remove_child(vdev_t *pvd, vdev_t *cvd) */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; - - if (cvd->vdev_ops->vdev_op_leaf) - cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit; } /* @@ -281,7 +276,7 @@ vdev_compact_children(vdev_t *pvd) /* * Allocate and minimally initialize a vdev_t. */ -static vdev_t * +vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; @@ -293,21 +288,18 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) spa->spa_root_vdev = vd; } - if (guid == 0) { + if (guid == 0 && ops != &vdev_hole_ops) { if (spa->spa_root_vdev == vd) { /* * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ - while (guid == 0 || spa_guid_exists(guid, 0)) - guid = spa_get_random(-1ULL); + guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. */ - while (guid == 0 || - spa_guid_exists(spa_guid(spa), guid)) - guid = spa_get_random(-1ULL); + guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } @@ -318,6 +310,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) vd->vdev_guid_sum = guid; vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; + vd->vdev_ishole = (ops == &vdev_hole_ops); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); @@ -397,6 +390,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (islog && spa_version(spa) < SPA_VERSION_SLOGS) return (ENOTSUP); + if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) + return (ENOTSUP); + /* * Set the nparity property for RAID-Z vdevs. */ @@ -404,10 +400,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (ops == &vdev_raidz_ops) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { - /* - * Currently, we can only support 3 parity devices. - */ - if (nparity == 0 || nparity > 3) + if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) return (EINVAL); /* * Previous versions could only support 1 or 2 parity @@ -472,15 +465,33 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); /* + * Retrieve the vdev creation time. 
+ */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, + &vd->vdev_crtxg); + + /* * If we're a top-level vdev, try to load the allocation parameters. */ - if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) { + if (parent && !parent->vdev_parent && + (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, + &vd->vdev_removing); + } + + if (parent && !parent->vdev_parent) { + ASSERT(alloctype == VDEV_ALLOC_LOAD || + alloctype == VDEV_ALLOC_ADD || + alloctype == VDEV_ALLOC_SPLIT || + alloctype == VDEV_ALLOC_ROOTPOOL); + vd->vdev_mg = metaslab_group_create(islog ? + spa_log_class(spa) : spa_normal_class(spa), vd); } /* @@ -507,18 +518,33 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING, + &vd->vdev_resilvering); + /* * When importing a pool, we want to ignore the persistent fault * state, as the diagnosis made on another system may not be - * valid in the current context. + * valid in the current context. Local vdevs will + * remain in the faulted state. */ - if (spa->spa_load_state == SPA_LOAD_OPEN) { + if (spa_load_state(spa) == SPA_LOAD_OPEN) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &vd->vdev_faulted); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &vd->vdev_degraded); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &vd->vdev_removed); + + if (vd->vdev_faulted || vd->vdev_degraded) { + char *aux; + + vd->vdev_label_aux = + VDEV_AUX_ERR_EXCEEDED; + if (nvlist_lookup_string(nv, + ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && + strcmp(aux, "external") == 0) + vd->vdev_label_aux = VDEV_AUX_EXTERNAL; + } } } @@ -544,6 +570,7 @@ vdev_free(vdev_t *vd) vdev_close(vd); ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); + ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); /* * Free all children. @@ -557,8 +584,10 @@ vdev_free(vdev_t *vd) /* * Discard allocation state. */ - if (vd == vd->vdev_top) + if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); + metaslab_group_destroy(vd->vdev_mg); + } ASSERT3U(vd->vdev_stat.vs_space, ==, 0); ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0); @@ -705,6 +734,7 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_state = cvd->vdev_state; + mvd->vdev_crtxg = cvd->vdev_crtxg; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); @@ -746,6 +776,7 @@ vdev_remove_parent(vdev_t *cvd) */ if (mvd->vdev_top == mvd) { uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; + cvd->vdev_orig_guid = cvd->vdev_guid; cvd->vdev_guid += guid_delta; cvd->vdev_guid_sum += guid_delta; } @@ -765,16 +796,22 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; - metaslab_class_t *mc; uint64_t m; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; - if (vd->vdev_ms_shift == 0) /* not being allocated from yet */ + ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); + + /* + * This vdev is not being allocated from yet or is a hole. 
+ */ + if (vd->vdev_ms_shift == 0) return (0); + ASSERT(!vd->vdev_ishole); + /* * Compute the raidz-deflation ratio. Note, we hard-code * in 128k (1 << 17) because it is the current "typical" blocksize. @@ -786,14 +823,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) ASSERT(oldc <= newc); - if (vd->vdev_islog) - mc = spa->spa_log_class; - else - mc = spa->spa_normal_class; - - if (vd->vdev_mg == NULL) - vd->vdev_mg = metaslab_group_create(mc, vd); - mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (oldc != 0) { @@ -828,6 +857,20 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg); } + if (txg == 0) + spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); + + /* + * If the vdev is being removed we don't activate + * the metaslabs since we want to ensure that no new + * allocations are performed on this device. + */ + if (oldc == 0 && !vd->vdev_removing) + metaslab_group_activate(vd->vdev_mg); + + if (txg == 0) + spa_config_exit(spa, SCL_ALLOC, FTAG); + return (0); } @@ -838,6 +881,7 @@ vdev_metaslab_fini(vdev_t *vd) uint64_t count = vd->vdev_ms_count; if (vd->vdev_ms != NULL) { + metaslab_group_passivate(vd->vdev_mg); for (m = 0; m < count; m++) if (vd->vdev_ms[m] != NULL) metaslab_fini(vd->vdev_ms[m]); @@ -965,6 +1009,10 @@ vdev_probe(vdev_t *vd, zio_t *zio) vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); + /* + * We can't change the vdev state in this context, so we + * kick off an async task to do it on our behalf. + */ if (zio != NULL) { vd->vdev_probe_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_PROBE); @@ -1007,12 +1055,35 @@ vdev_open_child(void *arg) vd->vdev_open_thread = NULL; } +boolean_t +vdev_uses_zvols(vdev_t *vd) +{ + if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, + strlen(ZVOL_DIR)) == 0) + return (B_TRUE); + for (int c = 0; c < vd->vdev_children; c++) + if (vdev_uses_zvols(vd->vdev_child[c])) + return (B_TRUE); + return (B_FALSE); +} + void vdev_open_children(vdev_t *vd) { taskq_t *tq; int children = vd->vdev_children; + /* + * in order to handle pools on top of zvols, do the opens + * in a single thread so that the same thread holds the + * spa_namespace_lock + */ + if (vdev_uses_zvols(vd)) { + for (int c = 0; c < children; c++) + vd->vdev_child[c]->vdev_open_error = + vdev_open(vd->vdev_child[c]); + return; + } tq = taskq_create("vdev_open", children, minclsyspri, children, children, TASKQ_PREPOPULATE); @@ -1046,10 +1117,16 @@ vdev_open(vdev_t *vd) vd->vdev_cant_write = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); + /* + * If this vdev is not removed, check its fault status. If it's + * faulted, bail out of the open. + */ if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); + ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || + vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, - VDEV_AUX_ERR_EXCEEDED); + vd->vdev_label_aux); return (ENXIO); } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); @@ -1059,6 +1136,11 @@ vdev_open(vdev_t *vd) error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift); + /* + * Reset the vdev_reopening flag so that we actually close + * the vdev on error. + */ + vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, NULL, ENXIO); @@ -1074,14 +1156,33 @@ vdev_open(vdev_t *vd) vd->vdev_removed = B_FALSE; + /* + * Recheck the faulted flag now that we have confirmed that + * the vdev is accessible. If we're faulted, bail. 
+ */ + if (vd->vdev_faulted) { + ASSERT(vd->vdev_children == 0); + ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || + vd->vdev_label_aux == VDEV_AUX_EXTERNAL); + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + vd->vdev_label_aux); + return (ENXIO); + } + if (vd->vdev_degraded) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { - vd->vdev_state = VDEV_STATE_HEALTHY; + vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } + /* + * For hole or missing vdevs we just return success. + */ + if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) + return (0); + for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, @@ -1157,8 +1258,8 @@ vdev_open(vdev_t *vd) */ if (vd->vdev_ops->vdev_op_leaf && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_IO_FAILURE); + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); return (error); } @@ -1189,7 +1290,7 @@ vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; nvlist_t *label; - uint64_t guid, top_guid; + uint64_t guid = 0, top_guid; uint64_t state; for (int c = 0; c < vd->vdev_children; c++) @@ -1202,6 +1303,8 @@ vdev_validate(vdev_t *vd) * overwrite the previous state. */ if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + uint64_t aux_guid = 0; + nvlist_t *nvl; if ((label = vdev_label_read_config(vd)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, @@ -1209,6 +1312,18 @@ vdev_validate(vdev_t *vd) return (0); } + /* + * Determine if this vdev has been split off into another + * pool. If so, then refuse to open it. + */ + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, + &aux_guid) == 0 && aux_guid == spa_guid(spa)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_SPLIT_POOL); + nvlist_free(label); + return (0); + } + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, @@ -1217,6 +1332,11 @@ vdev_validate(vdev_t *vd) return (0); } + if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) + != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, + &aux_guid) != 0) + aux_guid = 0; + /* * If this vdev just became a top-level vdev because its * sibling was detached, it will have adopted the parent's @@ -1224,12 +1344,16 @@ vdev_validate(vdev_t *vd) * Fortunately, either version of the label will have the * same top guid, so if we're a top-level vdev, we can * safely compare to that instead. + * + * If we split this vdev off instead, then we also check the + * original pool's guid. We don't want to consider the vdev + * corrupt if it is partway through a split operation. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0 || - (vd->vdev_guid != guid && + ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) && (vd->vdev_guid != top_guid || vd != vd->vdev_top))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); @@ -1248,11 +1372,11 @@ vdev_validate(vdev_t *vd) nvlist_free(label); /* - * If spa->spa_load_verbatim is true, no need to check the + * If this is a verbatim import, no need to check the * state of the pool. 
*/ - if (!spa->spa_load_verbatim && - spa->spa_load_state == SPA_LOAD_OPEN && + if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && + spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) return (EBADF); @@ -1275,9 +1399,17 @@ void vdev_close(vdev_t *vd) { spa_t *spa = vd->vdev_spa; + vdev_t *pvd = vd->vdev_parent; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + /* + * If our parent is reopening, then we are as well, unless we are + * going offline. + */ + if (pvd != NULL && pvd->vdev_reopening) + vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); + vd->vdev_ops->vdev_op_close(vd); vdev_cache_purge(vd); @@ -1297,12 +1429,49 @@ vdev_close(vdev_t *vd) } void +vdev_hold(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_is_root(spa)); + if (spa->spa_state == POOL_STATE_UNINITIALIZED) + return; + + for (int c = 0; c < vd->vdev_children; c++) + vdev_hold(vd->vdev_child[c]); + + if (vd->vdev_ops->vdev_op_leaf) + vd->vdev_ops->vdev_op_hold(vd); +} + +void +vdev_rele(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_is_root(spa)); + for (int c = 0; c < vd->vdev_children; c++) + vdev_rele(vd->vdev_child[c]); + + if (vd->vdev_ops->vdev_op_leaf) + vd->vdev_ops->vdev_op_rele(vd); +} + +/* + * Reopen all interior vdevs and any unopened leaves. We don't actually + * reopen leaf vdevs which had previously been opened as they might deadlock + * on the spa_config_lock. Instead we only obtain the leaf's physical size. + * If the leaf has never been opened then open it, as usual. + */ +void vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + /* set the reopening flag unless we're taking the vdev offline */ + vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); @@ -1370,7 +1539,9 @@ void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); + ASSERT(!vd->vdev_ishole); ASSERT(ISP2(flags)); + ASSERT(spa_writeable(vd->vdev_spa)); if (flags & VDD_METASLAB) (void) txg_list_add(&vd->vdev_ms_list, arg, txg); @@ -1385,7 +1556,7 @@ vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) * DTLs. * * A vdev's DTL (dirty time log) is the set of transaction groups for which - * the vdev has less than perfect replication. There are three kinds of DTL: + * the vdev has less than perfect replication. There are four kinds of DTL: * * DTL_MISSING: txgs for which the vdev has no valid copies of the data * @@ -1426,6 +1597,7 @@ vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); + ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(sm->sm_lock); if (!space_map_contains(sm, txg, size)) @@ -1479,14 +1651,16 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) vdev_dtl_reassess(vd->vdev_child[c], txg, scrub_txg, scrub_done); - if (vd == spa->spa_root_vdev) + if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + mutex_enter(&vd->vdev_dtl_lock); if (scrub_txg != 0 && - (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) { - /* XXX should check scrub_done? */ + (spa->spa_scrub_started || + (scn && scn->scn_phys.scn_errors == 0))) { /* * We completed a scrub up to scrub_txg. 
If we * did it without rebooting, then the scrub dtl @@ -1534,6 +1708,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) mutex_enter(&vd->vdev_dtl_lock); for (int t = 0; t < DTL_TYPES; t++) { + /* account for child's outage in parent's missing map */ + int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; if (t == DTL_SCRUB) continue; /* leaf vdevs only */ if (t == DTL_PARTIAL) @@ -1546,7 +1722,7 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_dtl_lock); - space_map_ref_add_map(&reftree, &cvd->vdev_dtl[t], 1); + space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1); mutex_exit(&cvd->vdev_dtl_lock); } space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref); @@ -1569,6 +1745,8 @@ vdev_dtl_load(vdev_t *vd) if (smo->smo_object == 0) return (0); + ASSERT(!vd->vdev_ishole); + if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0) return (error); @@ -1596,6 +1774,8 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) dmu_buf_t *db; dmu_tx_t *tx; + ASSERT(!vd->vdev_ishole); + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached) { @@ -1674,6 +1854,9 @@ vdev_dtl_required(vdev_t *vd) vd->vdev_cant_read = cant_read; vdev_dtl_reassess(tvd, 0, 0, B_FALSE); + if (!required && zio_injection_enabled) + required = !!zio_handle_device_injection(vd, NULL, ECHILD); + return (required); } @@ -1732,7 +1915,7 @@ vdev_load(vdev_t *vd) /* * If this is a top-level vdev, initialize its metaslabs. */ - if (vd == vd->vdev_top && + if (vd == vd->vdev_top && !vd->vdev_ishole && (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || vdev_metaslab_init(vd, 0) != 0)) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, @@ -1789,12 +1972,54 @@ vdev_validate_aux(vdev_t *vd) } void +vdev_remove(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + dmu_tx_t *tx; + + tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); + + if (vd->vdev_dtl_smo.smo_object) { + ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0); + (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx); + vd->vdev_dtl_smo.smo_object = 0; + } + + if (vd->vdev_ms != NULL) { + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + if (msp == NULL || msp->ms_smo.smo_object == 0) + continue; + + ASSERT3U(msp->ms_smo.smo_alloc, ==, 0); + (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx); + msp->ms_smo.smo_object = 0; + } + } + + if (vd->vdev_ms_array) { + (void) dmu_object_free(mos, vd->vdev_ms_array, tx); + vd->vdev_ms_array = 0; + vd->vdev_ms_shift = 0; + } + dmu_tx_commit(tx); +} + +void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; + boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); + + ASSERT(!vd->vdev_ishole); while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) metaslab_sync_done(msp, txg); + + if (reassess) + metaslab_sync_reassess(vd->vdev_mg); } void @@ -1805,6 +2030,8 @@ vdev_sync(vdev_t *vd, uint64_t txg) metaslab_t *msp; dmu_tx_t *tx; + ASSERT(!vd->vdev_ishole); + if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { ASSERT(vd == vd->vdev_top); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); @@ -1815,6 +2042,12 @@ vdev_sync(vdev_t *vd, uint64_t txg) dmu_tx_commit(tx); } + /* + * Remove the metadata associated with this vdev once it's empty. 
+ */ + if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) + vdev_remove(vd, txg); + while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); @@ -1837,11 +2070,11 @@ vdev_psize_to_asize(vdev_t *vd, uint64_t psize) * not be opened, and no I/O is attempted. */ int -vdev_fault(spa_t *spa, uint64_t guid) +vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { - vdev_t *vd; + vdev_t *vd, *tvd; - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -1849,19 +2082,28 @@ vdev_fault(spa_t *spa, uint64_t guid) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; + + /* + * We don't directly use the aux state here, but if we do a + * vdev_reopen(), we need this value to be present to remember why we + * were faulted. + */ + vd->vdev_label_aux = aux; + /* * Faulted state takes precedence over degraded. */ + vd->vdev_delayed_close = B_FALSE; vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; - vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); + vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* - * If marking the vdev as faulted cause the top-level vdev to become - * unavailable, then back off and simply mark the vdev as degraded - * instead. + * If this device has the only valid copy of the data, then + * back off and simply mark the vdev as degraded instead. */ - if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) { + if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; @@ -1869,12 +2111,10 @@ vdev_fault(spa_t *spa, uint64_t guid) * If we reopen the device and it's not dead, only then do we * mark it degraded. */ - vdev_reopen(vd); + vdev_reopen(tvd); - if (vdev_readable(vd)) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, - VDEV_AUX_ERR_EXCEEDED); - } + if (vdev_readable(vd)) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); } return (spa_vdev_state_exit(spa, vd, 0)); @@ -1886,11 +2126,11 @@ vdev_fault(spa_t *spa, uint64_t guid) * as I/O is concerned. 
*/ int -vdev_degrade(spa_t *spa, uint64_t guid) +vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -1907,7 +2147,7 @@ vdev_degrade(spa_t *spa, uint64_t guid) vd->vdev_degraded = 1ULL; if (!vdev_is_dead(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, - VDEV_AUX_ERR_EXCEEDED); + aux); return (spa_vdev_state_exit(spa, vd, 0)); } @@ -1923,7 +2163,7 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -1969,13 +2209,16 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) return (spa_vdev_state_exit(spa, vd, 0)); } -int -vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) +static int +vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *vd, *tvd; - int error; + int error = 0; + uint64_t generation; + metaslab_group_t *mg; - spa_vdev_state_enter(spa); +top: + spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -1984,6 +2227,8 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; + mg = tvd->vdev_mg; + generation = spa->spa_config_generation + 1; /* * If the device isn't already offline, try to offline it. @@ -1999,6 +2244,37 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) return (spa_vdev_state_exit(spa, NULL, EBUSY)); /* + * If the top-level is a slog and it has had allocations + * then proceed. We check that the vdev's metaslab group + * is not NULL since it's possible that we may have just + * added this vdev but not yet initialized its metaslabs. + */ + if (tvd->vdev_islog && mg != NULL) { + /* + * Prevent any future allocations. + */ + metaslab_group_passivate(mg); + (void) spa_vdev_state_exit(spa, vd, 0); + + error = spa_offline_log(spa); + + spa_vdev_state_enter(spa, SCL_ALLOC); + + /* + * Check to see if the config has changed. + */ + if (error || generation != spa->spa_config_generation) { + metaslab_group_activate(mg); + if (error) + return (spa_vdev_state_exit(spa, + vd, error)); + (void) spa_vdev_state_exit(spa, vd, 0); + goto top; + } + ASSERT3U(tvd->vdev_stat.vs_alloc, ==, 0); + } + + /* * Offline this device and reopen its top-level vdev. * If the top-level vdev is a log device then just offline * it. Otherwise, if this action results in the top-level @@ -2013,28 +2289,30 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, EBUSY)); } + + /* + * Add the device back into the metaslab rotor so that + * once we online the device it's open for business. 
+ */ + if (tvd->vdev_islog && mg != NULL) + metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); - if (!tvd->vdev_islog || !vdev_is_dead(tvd)) - return (spa_vdev_state_exit(spa, vd, 0)); + return (spa_vdev_state_exit(spa, vd, 0)); +} - (void) spa_vdev_state_exit(spa, vd, 0); +int +vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) +{ + int error; - error = dmu_objset_find(spa_name(spa), zil_vdev_offline, - NULL, DS_FIND_CHILDREN); - if (error) { - (void) vdev_online(spa, guid, 0, NULL); - return (error); - } - /* - * If we successfully offlined the log device then we need to - * sync out the current txg so that the "stubby" block can be - * removed by zil_sync(). - */ - txg_wait_synced(spa->spa_dsl_pool, 0); - return (0); + mutex_enter(&spa->spa_vdev_top_lock); + error = vdev_offline_locked(spa, guid, flags); + mutex_exit(&spa->spa_vdev_top_lock); + + return (error); } /* @@ -2068,13 +2346,22 @@ vdev_clear(spa_t *spa, vdev_t *vd) if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { - vd->vdev_faulted = vd->vdev_degraded = 0; + /* + * When reopening in reponse to a clear event, it may be due to + * a fmadm repair request. In this case, if the device is + * still broken, we want to still post the ereport again. + */ + vd->vdev_forcefault = B_TRUE; + + vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; - vdev_reopen(vd); + vdev_reopen(vd == rvd ? rvd : vd->vdev_top); - if (vd != rvd) + vd->vdev_forcefault = B_FALSE; + + if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) @@ -2082,12 +2369,30 @@ vdev_clear(spa_t *spa, vdev_t *vd) spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); } + + /* + * When clearing a FMA-diagnosed fault, we always want to + * unspare the device, as we assume that the original spare was + * done in response to the FMA fault. + */ + if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && + vd->vdev_parent->vdev_ops == &vdev_spare_ops && + vd->vdev_parent->vdev_child[0] == vd) + vd->vdev_unspare = B_TRUE; } boolean_t vdev_is_dead(vdev_t *vd) { - return (vd->vdev_state < VDEV_STATE_DEGRADED); + /* + * Holes and missing devices are always considered "dead". + * This simplifies the code since we don't have to check for + * these types of devices in the various code paths. + * Instead we rely on the fact that we skip over dead devices + * before issuing I/O to them. + */ + return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole || + vd->vdev_ops == &vdev_missing_ops); } boolean_t @@ -2116,7 +2421,7 @@ vdev_allocatable(vdev_t *vd) * we're asking two separate questions about it. 
*/ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && - !vd->vdev_cant_write); + !vd->vdev_cant_write && !vd->vdev_ishole); } boolean_t @@ -2146,7 +2451,6 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) mutex_enter(&vd->vdev_stat_lock); bcopy(&vd->vdev_stat, vs, sizeof (*vs)); - vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors; vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); @@ -2168,7 +2472,7 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } - vs->vs_scrub_examined += cvs->vs_scrub_examined; + cvs->vs_scan_removing = cvd->vdev_removing; mutex_exit(&vd->vdev_stat_lock); } } @@ -2185,6 +2489,19 @@ vdev_clear_stats(vdev_t *vd) } void +vdev_scan_stat_init(vdev_t *vd) +{ + vdev_stat_t *vs = &vd->vdev_stat; + + for (int c = 0; c < vd->vdev_children; c++) + vdev_scan_stat_init(vd->vdev_child[c]); + + mutex_enter(&vd->vdev_stat_lock); + vs->vs_scan_processed = 0; + mutex_exit(&vd->vdev_stat_lock); +} + +void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; @@ -2228,8 +2545,17 @@ vdev_stat_update(zio_t *zio, uint64_t psize) mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { - if (flags & ZIO_FLAG_SCRUB_THREAD) - vs->vs_scrub_repaired += psize; + if (flags & ZIO_FLAG_SCAN_THREAD) { + dsl_scan_phys_t *scn_phys = + &spa->spa_dsl_pool->dp_scan->scn_phys; + uint64_t *processed = &scn_phys->scn_processed; + + /* XXX cleanup? */ + if (vd->vdev_ops->vdev_op_leaf) + atomic_add_64(processed, psize); + vs->vs_scan_processed += psize; + } + if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } @@ -2254,6 +2580,14 @@ vdev_stat_update(zio_t *zio, uint64_t psize) !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; + /* + * Intent logs writes won't propagate their error to the root + * I/O so don't mark these types of failures as pool-level + * errors. + */ + if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) + return; + mutex_enter(&vd->vdev_stat_lock); if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { if (zio->io_error == ECKSUM) @@ -2267,14 +2601,17 @@ vdev_stat_update(zio_t *zio, uint64_t psize) if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || - (flags & ZIO_FLAG_SCRUB_THREAD))) { + (flags & ZIO_FLAG_SCAN_THREAD) || + spa->spa_claiming)) { /* - * This is either a normal write (not a repair), or it's a - * repair induced by the scrub thread. In the normal case, - * we commit the DTL change in the same txg as the block - * was born. In the scrub-induced repair case, we know that - * scrubs run in first-pass syncing context, so we commit - * the DTL change in spa->spa_syncing_txg. + * This is either a normal write (not a repair), or it's + * a repair induced by the scrub thread, or it's a repair + * made by zil_claim() during spa_load() in the first txg. + * In the normal case, we commit the DTL change in the same + * txg as the block was born. In the scrub-induced repair + * case, we know that scrubs run in first-pass syncing context, + * so we commit the DTL change in spa_syncing_txg(spa). + * In the zil_claim() case, we commit in spa_first_txg(spa). 
* * We currently do not make DTL entries for failed spontaneous * self-healing writes triggered by normal (non-scrubbing) @@ -2283,13 +2620,16 @@ vdev_stat_update(zio_t *zio, uint64_t psize) */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; - if (flags & ZIO_FLAG_SCRUB_THREAD) { + if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); - commit_txg = spa->spa_syncing_txg; + commit_txg = spa_syncing_txg(spa); + } else if (spa->spa_claiming) { + ASSERT(flags & ZIO_FLAG_IO_REPAIR); + commit_txg = spa_first_txg(spa); } - ASSERT(commit_txg >= spa->spa_syncing_txg); + ASSERT(commit_txg >= spa_syncing_txg(spa)); if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) @@ -2301,45 +2641,19 @@ vdev_stat_update(zio_t *zio, uint64_t psize) } } -void -vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) -{ - vdev_stat_t *vs = &vd->vdev_stat; - - for (int c = 0; c < vd->vdev_children; c++) - vdev_scrub_stat_update(vd->vdev_child[c], type, complete); - - mutex_enter(&vd->vdev_stat_lock); - - if (type == POOL_SCRUB_NONE) { - /* - * Update completion and end time. Leave everything else alone - * so we can report what happened during the previous scrub. - */ - vs->vs_scrub_complete = complete; - vs->vs_scrub_end = gethrestime_sec(); - } else { - vs->vs_scrub_type = type; - vs->vs_scrub_complete = 0; - vs->vs_scrub_examined = 0; - vs->vs_scrub_repaired = 0; - vs->vs_scrub_start = gethrestime_sec(); - vs->vs_scrub_end = 0; - } - - mutex_exit(&vd->vdev_stat_lock); -} - /* - * Update the in-core space usage stats for this vdev and the root vdev. + * Update the in-core space usage stats for this vdev, its metaslab class, + * and the root vdev. */ void -vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta, - boolean_t update_root) +vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, + int64_t space_delta) { int64_t dspace_delta = space_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; + metaslab_group_t *mg = vd->vdev_mg; + metaslab_class_t *mc = mg ? mg->mg_class : NULL; ASSERT(vd == vd->vdev_top); @@ -2355,28 +2669,26 @@ vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta, vd->vdev_deflate_ratio; mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_alloc += alloc_delta; + vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); - if (update_root) { - ASSERT(rvd == vd->vdev_parent); - ASSERT(vd->vdev_ms_count != 0); - - /* - * Don't count non-normal (e.g. intent log) space as part of - * the pool's capacity. 
- */ - if (vd->vdev_mg->mg_class != spa->spa_normal_class) - return; - + if (mc == spa_normal_class(spa)) { mutex_enter(&rvd->vdev_stat_lock); - rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_alloc += alloc_delta; + rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } + + if (mc != NULL) { + ASSERT(rvd == vd->vdev_parent); + ASSERT(vd->vdev_ms_count != 0); + + metaslab_class_space_update(mc, + alloc_delta, defer_delta, space_delta, dspace_delta); + } } /* @@ -2391,6 +2703,8 @@ vdev_config_dirty(vdev_t *vd) vdev_t *rvd = spa->spa_root_vdev; int c; + ASSERT(spa_writeable(spa)); + /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. @@ -2428,7 +2742,7 @@ vdev_config_dirty(vdev_t *vd) * sketchy, but it will work. */ nvlist_free(aux[c]); - aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE); + aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } @@ -2449,7 +2763,8 @@ vdev_config_dirty(vdev_t *vd) } else { ASSERT(vd == vd->vdev_top); - if (!list_link_active(&vd->vdev_config_dirty_node)) + if (!list_link_active(&vd->vdev_config_dirty_node) && + !vd->vdev_ishole) list_insert_head(&spa->spa_config_dirty_list, vd); } } @@ -2478,6 +2793,7 @@ vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; + ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* @@ -2490,7 +2806,7 @@ vdev_state_dirty(vdev_t *vd) (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); - if (!list_link_active(&vd->vdev_state_dirty_node)) + if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole) list_insert_head(&spa->spa_state_dirty_list, vd); } @@ -2523,6 +2839,12 @@ vdev_propagate_state(vdev_t *vd) for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; + /* + * Don't factor holes into the decision. + */ + if (child->vdev_ishole) + continue; + if (!vdev_readable(child) || (!vdev_writeable(child) && spa_writeable(spa))) { /* @@ -2586,15 +2908,31 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) /* * If we are setting the vdev state to anything but an open state, then - * always close the underlying device. Otherwise, we keep accessible - * but invalid devices open forever. We don't call vdev_close() itself, - * because that implies some extra checks (offline, etc) that we don't - * want here. This is limited to leaf devices, because otherwise - * closing the device will affect other children. + * always close the underlying device unless the device has requested + * a delayed close (i.e. we're about to remove or fault the device). + * Otherwise, we keep accessible but invalid devices open forever. + * We don't call vdev_close() itself, because that implies some extra + * checks (offline, etc) that we don't want here. This is limited to + * leaf devices, because otherwise closing the device will affect other + * children. */ - if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) + if (!vd->vdev_delayed_close && vdev_is_dead(vd) && + vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); + /* + * If we have brought this vdev back into service, we need + * to notify fmd so that it can gracefully repair any outstanding + * cases due to a missing device. We do this in all cases, even those + * that probably don't correlate to a repaired fault. This is sure to + * catch all cases, and we let the zfs-retire agent sort it out. 
If + * this is a transient state it's OK, as the retire agent will + * double-check the state of the vdev before repairing it. + */ + if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf && + vd->vdev_prevstate != state) + zfs_post_state_change(spa, vd); + if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { @@ -2610,20 +2948,16 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) vd->vdev_state = VDEV_STATE_REMOVED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } else if (state == VDEV_STATE_REMOVED) { - /* - * Indicate to the ZFS DE that this device has been removed, and - * any recent errors should be ignored. - */ - zfs_post_remove(spa, vd); vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* - * If we fail to open a vdev during an import, we mark it as - * "not available", which signifies that it was never there to - * begin with. Failure to open such a device is not considered - * an error. + * If we fail to open a vdev during an import or recovery, we + * mark it as "not available", which signifies that it was + * never there to begin with. Failure to open such a device + * is not considered an error. */ - if (spa->spa_load_state == SPA_LOAD_IMPORT && + if ((spa_load_state(spa) == SPA_LOAD_IMPORT || + spa_load_state(spa) == SPA_LOAD_RECOVER) && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; @@ -2666,9 +3000,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; - case VDEV_AUX_IO_FAILURE: - class = FM_EREPORT_ZFS_IO_FAILURE; - break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } @@ -2716,36 +3047,55 @@ vdev_is_bootable(vdev_t *vd) return (B_TRUE); } +/* + * Load the state from the original vdev tree (ovd) which + * we've retrieved from the MOS config object. If the original + * vdev was offline or faulted then we transfer that state to the + * device in the current vdev tree (nvd). + */ void -vdev_load_log_state(vdev_t *vd, nvlist_t *nv) +vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) { - uint_t children; - nvlist_t **child; - uint64_t val; - spa_t *spa = vd->vdev_spa; + spa_t *spa = nvd->vdev_spa; - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0) { - for (int c = 0; c < children; c++) - vdev_load_log_state(vd->vdev_child[c], child[c]); - } + ASSERT(nvd->vdev_top->vdev_islog); + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); - if (vd->vdev_ops->vdev_op_leaf && nvlist_lookup_uint64(nv, - ZPOOL_CONFIG_OFFLINE, &val) == 0 && val) { + for (int c = 0; c < nvd->vdev_children; c++) + vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); + if (nvd->vdev_ops->vdev_op_leaf) { /* - * It would be nice to call vdev_offline() - * directly but the pool isn't fully loaded and - * the txg threads have not been started yet. + * Restore the persistent vdev state */ - spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_WRITER); - vd->vdev_offline = val; - vdev_reopen(vd->vdev_top); - spa_config_exit(spa, SCL_STATE_ALL, FTAG); + nvd->vdev_offline = ovd->vdev_offline; + nvd->vdev_faulted = ovd->vdev_faulted; + nvd->vdev_degraded = ovd->vdev_degraded; + nvd->vdev_removed = ovd->vdev_removed; } } /* + * Determine if a log device has valid content. 
If the vdev was + * removed or faulted in the MOS config then we know that + * the content on the log device has already been written to the pool. + */ +boolean_t +vdev_log_state_valid(vdev_t *vd) +{ + if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && + !vd->vdev_removed) + return (B_TRUE); + + for (int c = 0; c < vd->vdev_children; c++) + if (vdev_log_state_valid(vd->vdev_child[c])) + return (B_TRUE); + + return (B_FALSE); +} + +/* * Expand a vdev if possible. */ void @@ -2759,3 +3109,22 @@ vdev_expand(vdev_t *vd, uint64_t txg) vdev_config_dirty(vd); } } + +/* + * Split a vdev. + */ +void +vdev_split(vdev_t *vd) +{ + vdev_t *cvd, *pvd = vd->vdev_parent; + + vdev_remove_child(pvd, vd); + vdev_compact_children(pvd); + + cvd = pvd->vdev_child[0]; + if (pvd->vdev_children == 1) { + vdev_remove_parent(cvd); + cvd->vdev_splitting = B_TRUE; + } + vdev_propagate_state(cvd); +}
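
Note on the retry pattern introduced above: the new vdev_offline_locked() passivates the log vdev's metaslab group, exits the vdev state lock while spa_offline_log() runs, re-enters with SCL_ALLOC, and jumps back to the top: label to retry if spa_config_generation no longer matches the value captured before the lock was dropped. The standalone userland sketch below models only that "drop lock, do slow work, re-check generation, retry" shape; it is not the kernel code, and the names cfg_lock, cfg_generation, do_slow_work(), and offline_with_retry() are illustrative stand-ins.

/*
 * Minimal userland model of the generation-check retry loop used by
 * vdev_offline_locked().  All names here are hypothetical; do_slow_work()
 * stands in for spa_offline_log().
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cfg_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long cfg_generation;	/* bumped whenever the config changes */

static int
do_slow_work(void)
{
	/* Slow operation that must not run under cfg_lock; may fail. */
	return (0);
}

static int
offline_with_retry(void)
{
	unsigned long gen;
	int error;

top:
	pthread_mutex_lock(&cfg_lock);
	gen = cfg_generation;

	/* Drop the lock for the slow part, as the kernel code does. */
	pthread_mutex_unlock(&cfg_lock);
	error = do_slow_work();
	pthread_mutex_lock(&cfg_lock);

	if (error != 0 || gen != cfg_generation) {
		/* The work failed, or the config moved underneath us. */
		pthread_mutex_unlock(&cfg_lock);
		if (error != 0)
			return (error);
		goto top;	/* config changed: undo nothing, just retry */
	}

	pthread_mutex_unlock(&cfg_lock);
	return (0);
}

int
main(void)
{
	printf("offline_with_retry() -> %d\n", offline_with_retry());
	return (0);
}

The design point the sketch captures is that the state lock is never held across the slow call; correctness is instead recovered after the fact by comparing the configuration generation and restarting the whole operation if anything changed in the window.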