+ return (verify_ok ? 0 : EIO);
+}
+
+/*
+ * Find a value in the pool props object.
+ */
+static void
+spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
+{
+ (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
+ zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
+}
+
+/*
+ * Find a value in the pool directory object.
+ */
+static int
+spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
+{
+ return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ name, sizeof (uint64_t), 1, val));
+}
+
+static int
+spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
+{
+ vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
+ return (err);
+}
+
+/*
+ * Fix up config after a partly-completed split. This is done with the
+ * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
+ * pool have that entry in their config, but only the splitting one contains
+ * a list of all the guids of the vdevs that are being split off.
+ *
+ * This function determines what to do with that list: either rejoin
+ * all the disks to the pool, or complete the splitting process. To attempt
+ * the rejoin, each disk that is offlined is marked online again, and
+ * we do a reopen() call. If the vdev label for every disk that was
+ * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
+ * then we call vdev_split() on each disk, and complete the split.
+ *
+ * Otherwise we leave the config alone, with all the vdevs in place in
+ * the original pool.
+ */
+static void
+spa_try_repair(spa_t *spa, nvlist_t *config)
+{
+ uint_t extracted;
+ uint64_t *glist;
+ uint_t i, gcount;
+ nvlist_t *nvl;
+ vdev_t **vd;
+ boolean_t attempt_reopen;
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
+ return;
+
+ /* check that the config is complete */
+ if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
+ &glist, &gcount) != 0)
+ return;
+
+ vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_PUSHPAGE);
+
+ /* attempt to online all the vdevs & validate */
+ attempt_reopen = B_TRUE;
+ for (i = 0; i < gcount; i++) {
+ if (glist[i] == 0) /* vdev is hole */
+ continue;
+
+ vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
+ if (vd[i] == NULL) {
+ /*
+ * Don't bother attempting to reopen the disks;
+ * just do the split.
+ */
+ attempt_reopen = B_FALSE;
+ } else {
+ /* attempt to re-online it */
+ vd[i]->vdev_offline = B_FALSE;
+ }
+ }
+
+ if (attempt_reopen) {
+ vdev_reopen(spa->spa_root_vdev);
+
+ /* check each device to see what state it's in */
+ for (extracted = 0, i = 0; i < gcount; i++) {
+ if (vd[i] != NULL &&
+ vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
+ break;
+ ++extracted;
+ }
+ }
+
+ /*
+ * If every disk has been moved to the new pool, or if we never
+ * even attempted to look at them, then we split them off for
+ * good.
+ */
+ if (!attempt_reopen || gcount == extracted) {
+ for (i = 0; i < gcount; i++)
+ if (vd[i] != NULL)
+ vdev_split(vd[i]);
+ vdev_reopen(spa->spa_root_vdev);
+ }
+
+ kmem_free(vd, gcount * sizeof (vdev_t *));
+}
+
+static int
+spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
+ boolean_t mosconfig)
+{
+ nvlist_t *config = spa->spa_config;
+ char *ereport = FM_EREPORT_ZFS_POOL;
+ char *comment;
+ int error;
+ uint64_t pool_guid;
+ nvlist_t *nvl;
+
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
+ return (EINVAL);
+
+ ASSERT(spa->spa_comment == NULL);
+ if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
+ spa->spa_comment = spa_strdup(comment);
+
+ /*
+ * Versioning wasn't explicitly added to the label until later, so if
+ * it's not present treat it as the initial version.
+ */
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+ &spa->spa_ubsync.ub_version) != 0)
+ spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
+
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ &spa->spa_config_txg);
+
+ if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
+ spa_guid_exists(pool_guid, 0)) {
+ error = EEXIST;
+ } else {
+ spa->spa_config_guid = pool_guid;
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
+ &nvl) == 0) {
+ VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
+ KM_PUSHPAGE) == 0);
+ }
+
+ nvlist_free(spa->spa_load_info);
+ spa->spa_load_info = fnvlist_alloc();
+
+ gethrestime(&spa->spa_loaded_ts);
+ error = spa_load_impl(spa, pool_guid, config, state, type,
+ mosconfig, &ereport);
+ }
+
+ spa->spa_minref = refcount_count(&spa->spa_refcount);
+ if (error) {
+ if (error != EEXIST) {
+ spa->spa_loaded_ts.tv_sec = 0;
+ spa->spa_loaded_ts.tv_nsec = 0;
+ }
+ if (error != EBADF) {
+ zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
+ }
+ }
+ spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
+ spa->spa_ena = 0;
+
+ return (error);
+}
+
+/*
+ * Load an existing storage pool, using the pool's builtin spa_config as a
+ * source of configuration information.
+ */
+__attribute__((always_inline))
+static inline int
+spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
+ spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
+ char **ereport)
+{
+ int error = 0;
+ nvlist_t *nvroot = NULL;
+ nvlist_t *label;
+ vdev_t *rvd;
+ uberblock_t *ub = &spa->spa_uberblock;
+ uint64_t children, config_cache_txg = spa->spa_config_txg;
+ int orig_mode = spa->spa_mode;
+ int parse;
+ uint64_t obj;
+ boolean_t missing_feat_write = B_FALSE;
+
+ /*
+ * If this is an untrusted config, access the pool in read-only mode.
+ * This prevents things like resilvering recently removed devices.
+ */
+ if (!mosconfig)
+ spa->spa_mode = FREAD;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa->spa_load_state = state;
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
+ return (EINVAL);
+
+ parse = (type == SPA_IMPORT_EXISTING ?
+ VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
+
+ /*
+ * Create "The Godfather" zio to hold all async IOs
+ */
+ spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
+
+ /*
+ * Parse the configuration into a vdev tree. We explicitly set the
+ * value that will be returned by spa_version() since parsing the
+ * configuration requires knowing the version number.
+ */
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (error != 0)
+ return (error);
+
+ ASSERT(spa->spa_root_vdev == rvd);
+
+ if (type != SPA_IMPORT_ASSEMBLE) {
+ ASSERT(spa_guid(spa) == pool_guid);
+ }
+
+ /*
+ * Try to open all vdevs, loading each label in the process.
+ */
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = vdev_open(rvd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ if (error != 0)
+ return (error);
+
+ /*
+ * We need to validate the vdev labels against the configuration that
+ * we have in hand, which is dependent on the setting of mosconfig. If
+ * mosconfig is true then we're validating the vdev labels based on
+ * that config. Otherwise, we're validating against the cached config
+ * (zpool.cache) that was read when we loaded the zfs module, and then
+ * later we will recursively call spa_load() and validate against
+ * the vdev config.
+ *
+ * If we're assembling a new pool that's been split off from an
+ * existing pool, the labels haven't yet been updated so we skip
+ * validation for now.
+ */
+ if (type != SPA_IMPORT_ASSEMBLE) {
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = vdev_validate(rvd, mosconfig);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (error != 0)
+ return (error);
+
+ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
+ return (ENXIO);
+ }
+
+ /*
+ * Find the best uberblock.
+ */
+ vdev_uberblock_load(rvd, ub, &label);
+
+ /*
+ * If we weren't able to find a single valid uberblock, return failure.
+ */
+ if (ub->ub_txg == 0) {
+ nvlist_free(label);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
+ }
+
+ /*
+ * If the pool has an unsupported version we can't open it.
+ */
+ if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
+ nvlist_free(label);
+ return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
+ }
+
+ if (ub->ub_version >= SPA_VERSION_FEATURES) {
+ nvlist_t *features;
+
+ /*
+ * If we weren't able to find what's necessary for reading the
+ * MOS in the label, return failure.
+ */
+ if (label == NULL || nvlist_lookup_nvlist(label,
+ ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
+ nvlist_free(label);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+ ENXIO));
+ }
+
+ /*
+ * Update our in-core representation with the definitive values
+ * from the label.
+ */
+ nvlist_free(spa->spa_label_features);
+ VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
+ }
+
+ nvlist_free(label);
+
+ /*
+ * Look through entries in the label nvlist's features_for_read. If
+ * there is a feature listed there which we don't understand then we
+ * cannot open a pool.
+ */
+ if (ub->ub_version >= SPA_VERSION_FEATURES) {
+ nvlist_t *unsup_feat;
+ nvpair_t *nvp;
+
+ VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
+ 0);
+
+ for (nvp = nvlist_next_nvpair(spa->spa_label_features, NULL);
+ nvp != NULL;
+ nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
+ if (!zfeature_is_supported(nvpair_name(nvp))) {
+ VERIFY(nvlist_add_string(unsup_feat,
+ nvpair_name(nvp), "") == 0);
+ }
+ }
+
+ if (!nvlist_empty(unsup_feat)) {
+ VERIFY(nvlist_add_nvlist(spa->spa_load_info,
+ ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
+ nvlist_free(unsup_feat);
+ return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
+ ENOTSUP));
+ }
+
+ nvlist_free(unsup_feat);
+ }
+
+ /*
+ * If the vdev guid sum doesn't match the uberblock, we have an
+ * incomplete configuration. We first check to see if the pool
+ * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN).
+ * If it is, defer the vdev_guid_sum check till later so we
+ * can handle missing vdevs.
+ */
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
+ &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
+ rvd->vdev_guid_sum != ub->ub_guid_sum)
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
+
+ if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_try_repair(spa, config);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ nvlist_free(spa->spa_config_splitting);
+ spa->spa_config_splitting = NULL;
+ }
+
+ /*
+ * Initialize internal SPA structures.
+ */
+ spa->spa_state = POOL_STATE_ACTIVE;
+ spa->spa_ubsync = spa->spa_uberblock;
+ spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
+ TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
+ spa->spa_first_txg = spa->spa_last_ubsync_txg ?
+ spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
+ spa->spa_claim_max_txg = spa->spa_first_txg;
+ spa->spa_prev_software_version = ub->ub_software_version;
+
+ error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
+ if (error)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
+
+ if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ if (spa_version(spa) >= SPA_VERSION_FEATURES) {
+ boolean_t missing_feat_read = B_FALSE;
+ nvlist_t *unsup_feat, *enabled_feat;
+
+ if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
+ &spa->spa_feat_for_read_obj) != 0) {
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
+ &spa->spa_feat_for_write_obj) != 0) {
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
+ &spa->spa_feat_desc_obj) != 0) {
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ enabled_feat = fnvlist_alloc();
+ unsup_feat = fnvlist_alloc();
+
+ if (!feature_is_supported(spa->spa_meta_objset,
+ spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj,
+ unsup_feat, enabled_feat))
+ missing_feat_read = B_TRUE;
+
+ if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
+ if (!feature_is_supported(spa->spa_meta_objset,
+ spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj,
+ unsup_feat, enabled_feat)) {
+ missing_feat_write = B_TRUE;
+ }
+ }
+
+ fnvlist_add_nvlist(spa->spa_load_info,
+ ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
+
+ if (!nvlist_empty(unsup_feat)) {
+ fnvlist_add_nvlist(spa->spa_load_info,
+ ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
+ }
+
+ fnvlist_free(enabled_feat);
+ fnvlist_free(unsup_feat);
+
+ if (!missing_feat_read) {
+ fnvlist_add_boolean(spa->spa_load_info,
+ ZPOOL_CONFIG_CAN_RDONLY);
+ }
+
+ /*
+ * If the state is SPA_LOAD_TRYIMPORT, our objective is
+ * twofold: to determine whether the pool is available for
+ * import in read-write mode and (if it is not) whether the
+ * pool is available for import in read-only mode. If the pool
+ * is available for import in read-write mode, it is displayed
+ * as available in userland; if it is not available for import
+ * in read-only mode, it is displayed as unavailable in
+ * userland. If the pool is available for import in read-only
+ * mode but not read-write mode, it is displayed as unavailable
+ * in userland with a special note that the pool is actually
+ * available for open in read-only mode.
+ *
+ * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
+ * missing a feature for write, we must first determine whether
+ * the pool can be opened read-only before returning to
+ * userland in order to know whether to display the
+ * abovementioned note.
+ */
+ if (missing_feat_read || (missing_feat_write &&
+ spa_writeable(spa))) {
+ return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
+ ENOTSUP));
+ }
+ }
+
+ spa->spa_is_initializing = B_TRUE;
+ error = dsl_pool_open(spa->spa_dsl_pool);
+ spa->spa_is_initializing = B_FALSE;
+ if (error != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ if (!mosconfig) {
+ uint64_t hostid;
+ nvlist_t *policy = NULL, *nvconfig;
+
+ if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
+ ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
+ char *hostname;
+ unsigned long myhostid = 0;
+
+ VERIFY(nvlist_lookup_string(nvconfig,
+ ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
+
+#ifdef _KERNEL
+ myhostid = zone_get_hostid(NULL);
+#else /* _KERNEL */
+ /*
+ * We're emulating the system's hostid in userland, so
+ * we can't use zone_get_hostid().
+ */
+ (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
+#endif /* _KERNEL */
+ if (hostid != 0 && myhostid != 0 &&
+ hostid != myhostid) {
+ nvlist_free(nvconfig);
+ cmn_err(CE_WARN, "pool '%s' could not be "
+ "loaded as it was last accessed by "
+ "another system (host: %s hostid: 0x%lx). "
+ "See: http://zfsonlinux.org/msg/ZFS-8000-EY",
+ spa_name(spa), hostname,
+ (unsigned long)hostid);
+ return (EBADF);
+ }
+ }
+ if (nvlist_lookup_nvlist(spa->spa_config,
+ ZPOOL_REWIND_POLICY, &policy) == 0)
+ VERIFY(nvlist_add_nvlist(nvconfig,
+ ZPOOL_REWIND_POLICY, policy) == 0);
+
+ spa_config_set(spa, nvconfig);
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_activate(spa, orig_mode);
+
+ return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
+ }
+
+ if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
+ if (error != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ /*
+ * Load the bit that tells us to use the new accounting function
+ * (raid-z deflation). If we have an older pool, this will not
+ * be present.
+ */
+ error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
+ &spa->spa_creation_version);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ /*
+ * Load the persistent error log. If we have an older pool, this will
+ * not be present.
+ */
+ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
+ &spa->spa_errlog_scrub);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ /*