X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fspa.c;h=9ae16d316367da89dc36ae5048426b54753ba876;hb=e5dc681a50a41d42c18b52abc8d5a3c2a357bdaa;hp=d7c5de0d357a2fab7e3a281ed52196aab97519c8;hpb=428870ff734fdaccc342b33fc53cf94724409a46;p=zfs.git diff --git a/module/zfs/spa.c b/module/zfs/spa.c index d7c5de0..9ae16d3 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -116,6 +116,7 @@ static boolean_t spa_has_active_shared_spare(spa_t *spa); static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, char **ereport); +static void spa_vdev_resilver_done(spa_t *spa); uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */ id_t zio_taskq_psrset_bind = PS_NONE; @@ -180,6 +181,8 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, size - alloc, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, + (spa_mode(spa) == FREAD), src); cap = (size == 0) ? 0 : (alloc * 100 / size); spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); @@ -529,7 +532,9 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp) nvpair_name(elem))) == ZPROP_INVAL) return (EINVAL); - if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT) + if (prop == ZPOOL_PROP_CACHEFILE || + prop == ZPOOL_PROP_ALTROOT || + prop == ZPOOL_PROP_READONLY) continue; need_sync = B_TRUE; @@ -648,8 +653,10 @@ spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode, static void spa_create_zio_taskqs(spa_t *spa) { - for (int t = 0; t < ZIO_TYPES; t++) { - for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { + int t, q; + + for (t = 0; t < ZIO_TYPES; t++) { + for (q = 0; q < ZIO_TASKQ_TYPES; q++) { const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; enum zti_modes mode = ztip->zti_mode; uint_t value = ztip->zti_value; @@ -803,6 +810,8 @@ spa_activate(spa_t *spa, int mode) static void spa_deactivate(spa_t *spa) { + int t, q; + ASSERT(spa->spa_sync_on == B_FALSE); ASSERT(spa->spa_dsl_pool == NULL); ASSERT(spa->spa_root_vdev == NULL); @@ -814,8 +823,8 @@ spa_deactivate(spa_t *spa) list_destroy(&spa->spa_config_dirty_list); list_destroy(&spa->spa_state_dirty_list); - for (int t = 0; t < ZIO_TYPES; t++) { - for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { + for (t = 0; t < ZIO_TYPES; t++) { + for (q = 0; q < ZIO_TASKQ_TYPES; q++) { if (spa->spa_zio_taskq[t][q] != NULL) taskq_destroy(spa->spa_zio_taskq[t][q]); spa->spa_zio_taskq[t][q] = NULL; @@ -878,6 +887,7 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, nvlist_t **child; uint_t children; int error; + int c; if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) return (error); @@ -897,7 +907,7 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, return (EINVAL); } - for (int c = 0; c < children; c++) { + for (c = 0; c < children; c++) { vdev_t *vd; if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, atype)) != 0) { @@ -1274,7 +1284,9 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) static void spa_check_removed(vdev_t *vd) { - for (int c = 0; c < vd->vdev_children; c++) + int c; + + for (c = 0; c < vd->vdev_children; c++) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { @@ -1284,33 +1296,132 @@ spa_check_removed(vdev_t *vd) } /* - * Load the slog device state from the config object since it's possible - * that the label does not contain the most up-to-date information. + * Validate the current config against the MOS config */ -void -spa_load_log_state(spa_t *spa, nvlist_t *nv) +static boolean_t +spa_config_valid(spa_t *spa, nvlist_t *config) { - vdev_t *ovd, *rvd = spa->spa_root_vdev; + vdev_t *mrvd, *rvd = spa->spa_root_vdev; + nvlist_t *nv; + int c, i; + + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); + + ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); /* - * Load the original root vdev tree from the passed config. + * If we're doing a normal import, then build up any additional + * diagnostic information about missing devices in this config. + * We'll pass this up to the user for further processing. */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); + if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { + nvlist_t **child, *nv; + uint64_t idx = 0; + + child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), + KM_SLEEP); + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + vdev_t *mtvd = mrvd->vdev_child[c]; + + if (tvd->vdev_ops == &vdev_missing_ops && + mtvd->vdev_ops != &vdev_missing_ops && + mtvd->vdev_islog) + child[idx++] = vdev_config_generate(spa, mtvd, + B_FALSE, 0); + } + + if (idx) { + VERIFY(nvlist_add_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, child, idx) == 0); + VERIFY(nvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); + + for (i = 0; i < idx; i++) + nvlist_free(child[i]); + } + nvlist_free(nv); + kmem_free(child, rvd->vdev_children * sizeof (char **)); + } + + /* + * Compare the root vdev tree with the information we have + * from the MOS config (mrvd). Check each top-level vdev + * with the corresponding MOS config top-level (mtvd). + */ + for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + vdev_t *mtvd = mrvd->vdev_child[c]; - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *cvd = rvd->vdev_child[c]; - if (cvd->vdev_islog) - vdev_load_log_state(cvd, ovd->vdev_child[c]); + /* + * Resolve any "missing" vdevs in the current configuration. + * If we find that the MOS config has more accurate information + * about the top-level vdev then use that vdev instead. + */ + if (tvd->vdev_ops == &vdev_missing_ops && + mtvd->vdev_ops != &vdev_missing_ops) { + + if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) + continue; + + /* + * Device specific actions. + */ + if (mtvd->vdev_islog) { + spa_set_log_state(spa, SPA_LOG_CLEAR); + } else { + /* + * XXX - once we have 'readonly' pool + * support we should be able to handle + * missing data devices by transitioning + * the pool to readonly. + */ + continue; + } + + /* + * Swap the missing vdev with the data we were + * able to obtain from the MOS config. + */ + vdev_remove_child(rvd, tvd); + vdev_remove_child(mrvd, mtvd); + + vdev_add_child(rvd, mtvd); + vdev_add_child(mrvd, tvd); + + spa_config_exit(spa, SCL_ALL, FTAG); + vdev_load(mtvd); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + vdev_reopen(rvd); + } else if (mtvd->vdev_islog) { + /* + * Load the slog device's state from the MOS config + * since it's possible that the label does not + * contain the most up-to-date information. + */ + vdev_load_log_state(tvd, mtvd); + vdev_reopen(tvd); + } } - vdev_free(ovd); + vdev_free(mrvd); spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * Ensure we were able to validate the config. + */ + return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); } /* * Check for missing log devices */ -int +static int spa_check_logs(spa_t *spa) { switch (spa->spa_log_state) { @@ -1332,13 +1443,14 @@ spa_passivate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; boolean_t slog_found = B_FALSE; + int c; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); if (!spa_has_slogs(spa)) return (B_FALSE); - for (int c = 0; c < rvd->vdev_children; c++) { + for (c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; @@ -1355,10 +1467,11 @@ static void spa_activate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; + int c; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); - for (int c = 0; c < rvd->vdev_children; c++) { + for (c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; @@ -1388,7 +1501,9 @@ spa_offline_log(spa_t *spa) static void spa_aux_check_removed(spa_aux_vdev_t *sav) { - for (int i = 0; i < sav->sav_count; i++) + int i; + + for (i = 0; i < sav->sav_count; i++) spa_check_removed(sav->sav_vdevs[i]); } @@ -1474,9 +1589,19 @@ spa_load_verify(spa_t *spa) if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && sle.sle_data_count <= policy.zrp_maxdata) { + int64_t loss = 0; + verify_ok = B_TRUE; spa->spa_load_txg = spa->spa_uberblock.ub_txg; spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; + + loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; + VERIFY(nvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); + VERIFY(nvlist_add_int64(spa->spa_load_info, + ZPOOL_CONFIG_REWIND_TIME, loss) == 0); + VERIFY(nvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); } else { spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; } @@ -1635,13 +1760,21 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, KM_SLEEP) == 0); } + gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, pool_guid, config, state, type, mosconfig, &ereport); } spa->spa_minref = refcount_count(&spa->spa_refcount); - if (error && error != EBADF) - zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); + if (error) { + if (error != EEXIST) { + spa->spa_loaded_ts.tv_sec = 0; + spa->spa_loaded_ts.tv_nsec = 0; + } + if (error != EBADF) { + zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); + } + } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; spa->spa_ena = 0; @@ -1661,7 +1794,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, nvlist_t *nvroot = NULL; vdev_t *rvd; uberblock_t *ub = &spa->spa_uberblock; - uint64_t config_cache_txg = spa->spa_config_txg; + uint64_t children, config_cache_txg = spa->spa_config_txg; int orig_mode = spa->spa_mode; int parse; uint64_t obj; @@ -1760,9 +1893,13 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, /* * If the vdev guid sum doesn't match the uberblock, we have an - * incomplete configuration. + * incomplete configuration. We first check to see if the pool + * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). + * If it is, defer the vdev_guid_sum check till later so we + * can handle missing vdevs. */ - if (mosconfig && type != SPA_IMPORT_ASSEMBLE && + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, + &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && rvd->vdev_guid_sum != ub->ub_guid_sum) return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); @@ -1982,13 +2119,6 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, spa_config_exit(spa, SCL_ALL, FTAG); /* - * Check the state of the root vdev. If it can't be opened, it - * indicates one or more toplevel vdevs are faulted. - */ - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) - return (ENXIO); - - /* * Load the DDTs (dedup tables). */ error = ddt_load(spa); @@ -1997,16 +2127,12 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, spa_update_dspace(spa); - if (state != SPA_LOAD_TRYIMPORT) { - error = spa_load_verify(spa); - if (error) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, - error)); - } - /* - * Load the intent log state and check log integrity. If we're - * assembling a pool from a split, the log is not transferred over. + * Validate the config, using the MOS config to fill in any + * information which might be missing. If we fail to validate + * the config then declare the pool unfit for use. If we're + * assembling a pool from a split, the log is not transferred + * over. */ if (type != SPA_IMPORT_ASSEMBLE) { nvlist_t *nvconfig; @@ -2014,21 +2140,42 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - spa_load_log_state(spa, nvroot); + if (!spa_config_valid(spa, nvconfig)) { + nvlist_free(nvconfig); + return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, + ENXIO)); + } nvlist_free(nvconfig); + /* + * Now that we've validate the config, check the state of the + * root vdev. If it can't be opened, it indicates one or + * more toplevel vdevs are faulted. + */ + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) + return (ENXIO); + if (spa_check_logs(spa)) { *ereport = FM_EREPORT_ZFS_LOG_REPLAY; return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); } } + /* + * We've successfully opened the pool, verify that we're ready + * to start pushing transactions. + */ + if (state != SPA_LOAD_TRYIMPORT) { + if (error = spa_load_verify(spa)) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, + error)); + } + if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { dmu_tx_t *tx; int need_update = B_FALSE; + int c; ASSERT(state != SPA_LOAD_TRYIMPORT); @@ -2066,15 +2213,16 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * If the config cache is stale, or we have uninitialized * metaslabs (see spa_vdev_add()), then update the config. * - * If spa_load_verbatim is true, trust the current + * If this is a verbatim import, trust the current * in-core spa_config and update the disk labels. */ if (config_cache_txg != spa->spa_config_txg || - state == SPA_LOAD_IMPORT || spa->spa_load_verbatim || - state == SPA_LOAD_RECOVER) + state == SPA_LOAD_IMPORT || + state == SPA_LOAD_RECOVER || + (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) need_update = B_TRUE; - for (int c = 0; c < rvd->vdev_children; c++) + for (c = 0; c < rvd->vdev_children; c++) if (rvd->vdev_child[c]->vdev_ms_array == 0) need_update = B_TRUE; @@ -2110,12 +2258,14 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, static int spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) { + int mode = spa->spa_mode; + spa_unload(spa); spa_deactivate(spa); spa->spa_load_max_txg--; - spa_activate(spa, spa_mode_global); + spa_activate(spa, mode); spa_async_suspend(spa); return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); @@ -2173,9 +2323,6 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, rewind_error = spa_load_retry(spa, state, mosconfig); } - if (config) - spa_rewind_data_to_nvlist(spa, config); - spa->spa_extreme_rewind = B_FALSE; spa->spa_load_max_txg = UINT64_MAX; @@ -2202,6 +2349,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, nvlist_t **config) { spa_t *spa; + spa_load_state_t state = SPA_LOAD_OPEN; int error; int locked = B_FALSE; @@ -2225,7 +2373,6 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, } if (spa->spa_state == POOL_STATE_UNINITIALIZED) { - spa_load_state_t state = SPA_LOAD_OPEN; zpool_rewind_policy_t policy; zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, @@ -2264,9 +2411,13 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, * information: the state of each vdev after the * attempted vdev_open(). Return this to the user. */ - if (config != NULL && spa->spa_config) + if (config != NULL && spa->spa_config) { VERIFY(nvlist_dup(spa->spa_config, config, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist(*config, + ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); + } spa_unload(spa); spa_deactivate(spa); spa->spa_last_open_failed = error; @@ -2275,15 +2426,22 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, *spapp = NULL; return (error); } - } spa_open_ref(spa, tag); - if (config != NULL) *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + /* + * If we've recovered the pool, pass back any information we + * gathered while doing the load. + */ + if (state == SPA_LOAD_RECOVER) { + VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); + } + if (locked) { spa->spa_last_open_failed = 0; spa->spa_last_ubsync_txg = 0; @@ -2459,6 +2617,13 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); if (*config != NULL) { + uint64_t loadtimes[2]; + + loadtimes[0] = spa->spa_loaded_ts.tv_sec; + loadtimes[1] = spa->spa_loaded_ts.tv_nsec; + VERIFY(nvlist_add_uint64_array(*config, + ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); + VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, spa_get_errlog_size(spa)) == 0); @@ -2693,6 +2858,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; uint64_t version, obj; + int c; /* * If this pool already exists, return failure. @@ -2751,7 +2917,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, (error = vdev_create(rvd, txg, B_FALSE)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { - for (int c = 0; c < rvd->vdev_children; c++) { + for (c = 0; c < rvd->vdev_children; c++) { vdev_metaslab_set_size(rvd->vdev_child[c]); vdev_expand(rvd->vdev_child[c], txg); } @@ -2951,7 +3117,9 @@ spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) static void spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) { - for (int c = 0; c < vd->vdev_children; c++) + int c; + + for (c = 0; c < vd->vdev_children; c++) spa_alt_rootvdev(vd->vdev_child[c], avd, txg); if (vd->vdev_ops->vdev_op_leaf) { @@ -3032,7 +3200,7 @@ spa_import_rootpool(char *devpath, char *devid) spa = spa_add(pname, config, NULL); spa->spa_is_root = B_TRUE; - spa->spa_load_verbatim = B_TRUE; + spa->spa_import_flags = ZFS_IMPORT_VERBATIM; /* * Build up a vdev tree based on the boot device's label config. @@ -3081,7 +3249,8 @@ spa_import_rootpool(char *devpath, char *devid) !bvd->vdev_isspare) { cmn_err(CE_NOTE, "The boot device is currently spared. Please " "try booting from '%s'", - bvd->vdev_parent->vdev_child[1]->vdev_path); + bvd->vdev_parent-> + vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); error = EINVAL; goto out; } @@ -3101,48 +3270,17 @@ out: #endif /* - * Take a pool and insert it into the namespace as if it had been loaded at - * boot. - */ -int -spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) -{ - spa_t *spa; - char *altroot = NULL; - - mutex_enter(&spa_namespace_lock); - if (spa_lookup(pool) != NULL) { - mutex_exit(&spa_namespace_lock); - return (EEXIST); - } - - (void) nvlist_lookup_string(props, - zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - spa = spa_add(pool, config, altroot); - - spa->spa_load_verbatim = B_TRUE; - - if (props != NULL) - spa_configfile_set(spa, props, B_FALSE); - - spa_config_sync(spa, B_FALSE, B_TRUE); - - mutex_exit(&spa_namespace_lock); - spa_history_log_version(spa, LOG_POOL_IMPORT); - - return (0); -} - -/* * Import a non-root pool into the system. */ int -spa_import(const char *pool, nvlist_t *config, nvlist_t *props) +spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) { spa_t *spa; char *altroot = NULL; spa_load_state_t state = SPA_LOAD_IMPORT; zpool_rewind_policy_t policy; + uint64_t mode = spa_mode_global; + uint64_t readonly = B_FALSE; int error; nvlist_t *nvroot; nvlist_t **spares, **l2cache; @@ -3157,23 +3295,45 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) return (EEXIST); } - zpool_get_rewind_policy(config, &policy); - if (policy.zrp_request & ZPOOL_DO_REWIND) - state = SPA_LOAD_RECOVER; - /* * Create and initialize the spa structure. */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); + (void) nvlist_lookup_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); + if (readonly) + mode = FREAD; spa = spa_add(pool, config, altroot); - spa_activate(spa, spa_mode_global); + spa->spa_import_flags = flags; + + /* + * Verbatim import - Take a pool and insert it into the namespace + * as if it had been loaded at boot. + */ + if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { + if (props != NULL) + spa_configfile_set(spa, props, B_FALSE); + + spa_config_sync(spa, B_FALSE, B_TRUE); + + mutex_exit(&spa_namespace_lock); + spa_history_log_version(spa, LOG_POOL_IMPORT); + + return (0); + } + + spa_activate(spa, mode); /* * Don't start async tasks until we know everything is healthy. */ spa_async_suspend(spa); + zpool_get_rewind_policy(config, &policy); + if (policy.zrp_request & ZPOOL_DO_REWIND) + state = SPA_LOAD_RECOVER; + /* * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig * because the user-supplied config is actually the one to trust when @@ -3181,14 +3341,16 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) */ if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; + error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, policy.zrp_request); /* - * Propagate anything learned about failing or best txgs - * back to caller + * Propagate anything learned while loading the pool and pass it + * back to caller (i.e. rewind info, missing devices, etc). */ - spa_rewind_data_to_nvlist(spa, config); + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* @@ -3228,6 +3390,8 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) return (error); } + spa_async_resume(spa); + /* * Override any spares and level 2 cache devices as specified by * the user, as these may have correct device names/devids, etc. @@ -3278,8 +3442,6 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); } - spa_async_resume(spa); - /* * It's possible that the pool was expanded while it was exported. * We kick off an async task to handle this for us. @@ -3541,6 +3703,9 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) vdev_t *vd, *tvd; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; + int c; + + ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); @@ -3575,7 +3740,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) /* * Transfer each new top-level vdev from vd to rvd. */ - for (int c = 0; c < vd->vdev_children; c++) { + for (c = 0; c < vd->vdev_children; c++) { /* * Set the vdev id to the first hole, if one exists. @@ -3653,6 +3818,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) int newvd_isspare; int error; + ASSERT(spa_writeable(spa)); + txg = spa_vdev_enter(spa); oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); @@ -3702,7 +3869,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) * spares. */ if (pvd->vdev_ops == &vdev_spare_ops && - pvd->vdev_child[1] == oldvd && + oldvd->vdev_isspare && !spa_has_spare(spa, newvd->vdev_guid)) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); @@ -3714,13 +3881,15 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) * the same (spare replaces spare, non-spare replaces * non-spare). */ - if (pvd->vdev_ops == &vdev_replacing_ops) + if (pvd->vdev_ops == &vdev_replacing_ops && + spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - else if (pvd->vdev_ops == &vdev_spare_ops && - newvd->vdev_isspare != oldvd->vdev_isspare) + } else if (pvd->vdev_ops == &vdev_spare_ops && + newvd->vdev_isspare != oldvd->vdev_isspare) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - else if (pvd->vdev_ops != &vdev_spare_ops && - newvd->vdev_isspare) + } + + if (newvd->vdev_isspare) pvops = &vdev_spare_ops; else pvops = &vdev_replacing_ops; @@ -3755,6 +3924,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) } } + /* mark the device being resilvered */ + newvd->vdev_resilvering = B_TRUE; + /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. @@ -3823,6 +3995,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) spa_strfree(oldvdpath); spa_strfree(newvdpath); + if (spa->spa_bootfs) + spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); + return (0); } @@ -3840,8 +4015,10 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; uint64_t unspare_guid; - size_t len; char *vdpath; + int c, t; + + ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); @@ -3872,18 +4049,11 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); /* - * If replace_done is specified, only remove this device if it's - * the first child of a replacing vdev. For the 'spare' vdev, either - * disk can be removed. + * Only 'replacing' or 'spare' vdevs can be replaced. */ - if (replace_done) { - if (pvd->vdev_ops == &vdev_replacing_ops) { - if (vd->vdev_id != 0) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - } else if (pvd->vdev_ops != &vdev_spare_ops) { - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - } - } + if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && + pvd->vdev_ops != &vdev_spare_ops) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); ASSERT(pvd->vdev_ops != &vdev_spare_ops || spa_version(spa) >= SPA_VERSION_SPARES); @@ -3910,16 +4080,22 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) * check to see if we changed the original vdev's path to have "/old" * at the end in spa_vdev_attach(). If so, undo that change now. */ - if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && - pvd->vdev_child[0]->vdev_path != NULL && - pvd->vdev_child[1]->vdev_path != NULL) { - ASSERT(pvd->vdev_child[1] == vd); - cvd = pvd->vdev_child[0]; - len = strlen(vd->vdev_path); - if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && - strcmp(cvd->vdev_path + len, "/old") == 0) { - spa_strfree(cvd->vdev_path); - cvd->vdev_path = spa_strdup(vd->vdev_path); + if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && + vd->vdev_path != NULL) { + size_t len = strlen(vd->vdev_path); + + for (c = 0; c < pvd->vdev_children; c++) { + cvd = pvd->vdev_child[c]; + + if (cvd == vd || cvd->vdev_path == NULL) + continue; + + if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && + strcmp(cvd->vdev_path + len, "/old") == 0) { + spa_strfree(cvd->vdev_path); + cvd->vdev_path = spa_strdup(vd->vdev_path); + break; + } } } @@ -3929,7 +4105,8 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) * active spare list for the pool. */ if (pvd->vdev_ops == &vdev_spare_ops && - vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare) + vd->vdev_id == 0 && + pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) unspare = B_TRUE; /* @@ -3951,7 +4128,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) /* * Remember one of the remaining children so we can get tvd below. */ - cvd = pvd->vdev_child[0]; + cvd = pvd->vdev_child[pvd->vdev_children - 1]; /* * If we need to remove the remaining child from the list of hot spares, @@ -3967,14 +4144,20 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) spa_spare_remove(cvd); unspare_guid = cvd->vdev_guid; (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); + cvd->vdev_unspare = B_TRUE; } /* * If the parent mirror/replacing vdev only has one child, * the parent is no longer needed. Remove it from the tree. */ - if (pvd->vdev_children == 1) + if (pvd->vdev_children == 1) { + if (pvd->vdev_ops == &vdev_spare_ops) + cvd->vdev_unspare = B_FALSE; vdev_remove_parent(cvd); + cvd->vdev_resilvering = B_FALSE; + } + /* * We don't set tvd until now because the parent we just removed @@ -4009,13 +4192,16 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) * prevent vd from being accessed after it's freed. */ vdpath = spa_strdup(vd->vdev_path); - for (int t = 0; t < TXG_SIZE; t++) + for (t = 0; t < TXG_SIZE; t++) (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); vd->vdev_detached = B_TRUE; vdev_dirty(tvd, VDD_DTL, vd, txg); spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); + /* hang on to the spa before we release the lock */ + spa_open_ref(spa, FTAG); + error = spa_vdev_exit(spa, vd, txg, 0); spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL, @@ -4028,24 +4214,31 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) * list of every other pool. */ if (unspare) { - spa_t *myspa = spa; - spa = NULL; + spa_t *altspa = NULL; + mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(spa)) != NULL) { - if (spa->spa_state != POOL_STATE_ACTIVE) + while ((altspa = spa_next(altspa)) != NULL) { + if (altspa->spa_state != POOL_STATE_ACTIVE || + altspa == spa) continue; - if (spa == myspa) - continue; - spa_open_ref(spa, FTAG); + + spa_open_ref(altspa, FTAG); mutex_exit(&spa_namespace_lock); - (void) spa_vdev_remove(spa, unspare_guid, - B_TRUE); + (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); mutex_enter(&spa_namespace_lock); - spa_close(spa, FTAG); + spa_close(altspa, FTAG); } mutex_exit(&spa_namespace_lock); + + /* search the rest of the vdevs for spares to remove */ + spa_vdev_resilver_done(spa); } + /* all done with the spa; OK to release */ + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + mutex_exit(&spa_namespace_lock); + return (error); } @@ -4066,8 +4259,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, vdev_t *rvd, **vml = NULL; /* vdev modify list */ boolean_t activate_slog; - if (!spa_writeable(spa)) - return (EROFS); + ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); @@ -4335,7 +4527,9 @@ out: static nvlist_t * spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) { - for (int i = 0; i < count; i++) { + int i; + + for (i = 0; i < count; i++) { uint64_t guid; VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, @@ -4353,11 +4547,12 @@ spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, nvlist_t *dev_to_remove) { nvlist_t **newdev = NULL; + int i, j; if (count > 1) newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); - for (int i = 0, j = 0; i < count; i++) { + for (i = 0, j = 0; i < count; i++) { if (dev[i] == dev_to_remove) continue; VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); @@ -4366,7 +4561,7 @@ spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); - for (int i = 0; i < count - 1; i++) + for (i = 0; i < count - 1; i++) nvlist_free(newdev[i]); if (count > 1) @@ -4484,6 +4679,8 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) int error = 0; boolean_t locked = MUTEX_HELD(&spa_namespace_lock); + ASSERT(spa_writeable(spa)); + if (!locked) txg = spa_vdev_enter(spa); @@ -4585,19 +4782,27 @@ static vdev_t * spa_vdev_resilver_done_hunt(vdev_t *vd) { vdev_t *newvd, *oldvd; + int c; - for (int c = 0; c < vd->vdev_children; c++) { + for (c = 0; c < vd->vdev_children; c++) { oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); if (oldvd != NULL) return (oldvd); } /* - * Check for a completed replacement. + * Check for a completed replacement. We always consider the first + * vdev in the list to be the oldest vdev, and the last one to be + * the newest (see spa_vdev_attach() for how that works). In + * the case where the newest vdev is faulted, we will not automatically + * remove it after a resilver completes. This is OK as it will require + * user intervention to determine which disk the admin wishes to keep. */ - if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { + if (vd->vdev_ops == &vdev_replacing_ops) { + ASSERT(vd->vdev_children > 1); + + newvd = vd->vdev_child[vd->vdev_children - 1]; oldvd = vd->vdev_child[0]; - newvd = vd->vdev_child[1]; if (vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && @@ -4608,16 +4813,41 @@ spa_vdev_resilver_done_hunt(vdev_t *vd) /* * Check for a completed resilver with the 'unspare' flag set. */ - if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { - newvd = vd->vdev_child[0]; - oldvd = vd->vdev_child[1]; + if (vd->vdev_ops == &vdev_spare_ops) { + vdev_t *first = vd->vdev_child[0]; + vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; - if (newvd->vdev_unspare && + if (last->vdev_unspare) { + oldvd = first; + newvd = last; + } else if (first->vdev_unspare) { + oldvd = last; + newvd = first; + } else { + oldvd = NULL; + } + + if (oldvd != NULL && vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && - !vdev_dtl_required(oldvd)) { - newvd->vdev_unspare = 0; + !vdev_dtl_required(oldvd)) return (oldvd); + + /* + * If there are more than two spares attached to a disk, + * and those spares are not required, then we want to + * attempt to free them up now so that they can be used + * by other pools. Once we're back down to a single + * disk+spare, we stop removing them. + */ + if (vd->vdev_children > 2) { + newvd = vd->vdev_child[1]; + + if (newvd->vdev_isspare && last->vdev_isspare && + vdev_dtl_empty(last, DTL_MISSING) && + vdev_dtl_empty(last, DTL_OUTAGE) && + !vdev_dtl_required(newvd)) + return (newvd); } } @@ -4644,9 +4874,9 @@ spa_vdev_resilver_done(spa_t *spa) * we need to detach the parent's first child (the original hot * spare) as well. */ - if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) { + if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && + ppvd->vdev_children == 2) { ASSERT(pvd->vdev_ops == &vdev_replacing_ops); - ASSERT(ppvd->vdev_children == 2); sguid = ppvd->vdev_child[1]->vdev_guid; } spa_config_exit(spa, SCL_ALL, FTAG); @@ -4670,6 +4900,8 @@ spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, vdev_t *vd; boolean_t sync = B_FALSE; + ASSERT(spa_writeable(spa)); + spa_vdev_state_enter(spa, SCL_ALL); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) @@ -4755,6 +4987,8 @@ spa_scan(spa_t *spa, pool_scan_func_t func) static void spa_async_remove(spa_t *spa, vdev_t *vd) { + int c; + if (vd->vdev_remove_wanted) { vd->vdev_remove_wanted = B_FALSE; vd->vdev_delayed_close = B_FALSE; @@ -4773,19 +5007,21 @@ spa_async_remove(spa_t *spa, vdev_t *vd) vdev_state_dirty(vd->vdev_top); } - for (int c = 0; c < vd->vdev_children; c++) + for (c = 0; c < vd->vdev_children; c++) spa_async_remove(spa, vd->vdev_child[c]); } static void spa_async_probe(spa_t *spa, vdev_t *vd) { + int c; + if (vd->vdev_probe_wanted) { vd->vdev_probe_wanted = B_FALSE; vdev_reopen(vd); /* vdev_open() does the actual probe */ } - for (int c = 0; c < vd->vdev_children; c++) + for (c = 0; c < vd->vdev_children; c++) spa_async_probe(spa, vd->vdev_child[c]); } @@ -4795,11 +5031,12 @@ spa_async_autoexpand(spa_t *spa, vdev_t *vd) sysevent_id_t eid; nvlist_t *attr; char *physpath; + int c; if (!spa->spa_autoexpand) return; - for (int c = 0; c < vd->vdev_children; c++) { + for (c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; spa_async_autoexpand(spa, cvd); } @@ -4823,7 +5060,7 @@ spa_async_autoexpand(spa_t *spa, vdev_t *vd) static void spa_async_thread(spa_t *spa) { - int tasks; + int tasks, i; ASSERT(spa->spa_sync_on); @@ -4862,9 +5099,9 @@ spa_async_thread(spa_t *spa) if (tasks & SPA_ASYNC_REMOVE) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_remove(spa, spa->spa_root_vdev); - for (int i = 0; i < spa->spa_l2cache.sav_count; i++) + for (i = 0; i < spa->spa_l2cache.sav_count; i++) spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); - for (int i = 0; i < spa->spa_spares.sav_count; i++) + for (i = 0; i < spa->spa_spares.sav_count; i++) spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); (void) spa_vdev_state_exit(spa, NULL, 0); } @@ -5115,9 +5352,11 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) ASSERT(spa->spa_root != NULL); break; + case ZPOOL_PROP_READONLY: case ZPOOL_PROP_CACHEFILE: /* - * 'cachefile' is also a non-persisitent property. + * 'readonly' and 'cachefile' are also non-persisitent + * properties. */ break; default: @@ -5248,6 +5487,9 @@ spa_sync(spa_t *spa, uint64_t txg) vdev_t *vd; dmu_tx_t *tx; int error; + int c; + + VERIFY(spa_writeable(spa)); /* * Lock out configuration changes. @@ -5379,7 +5621,7 @@ spa_sync(spa_t *spa, uint64_t txg) int children = rvd->vdev_children; int c0 = spa_get_random(children); - for (int c = 0; c < children; c++) { + for (c = 0; c < children; c++) { vd = rvd->vdev_child[(c0 + c) % children]; if (vd->vdev_ms_array == 0 || vd->vdev_islog) continue; @@ -5467,7 +5709,8 @@ spa_sync_allpools(void) spa_t *spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { - if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa)) + if (spa_state(spa) != POOL_STATE_ACTIVE || + !spa_writeable(spa) || spa_suspended(spa)) continue; spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); @@ -5547,6 +5790,8 @@ spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) void spa_upgrade(spa_t *spa, uint64_t version) { + ASSERT(spa_writeable(spa)); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /*