Illumos #3090 and #3102
authorGeorge Wilson <george.wilson@delphix.com>
Fri, 14 Dec 2012 20:38:04 +0000 (12:38 -0800)
committerBrian Behlendorf <behlendorf1@llnl.gov>
Tue, 8 Jan 2013 18:35:42 +0000 (10:35 -0800)
3090 vdev_reopen() during reguid causes vdev to be treated as corrupt
3102 vdev_uberblock_load() and vdev_validate() may read the wrong label

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Christopher Siden <chris.siden@delphix.com>
Reviewed by: Garrett D'Amore <garrett@damore.org>
Approved by: Eric Schrock <Eric.Schrock@delphix.com>

References:
  illumos/illumos-gate@dfbb943217bf8ab22a1a9d2e9dca01d4da95ee0b
  illumos changeset: 13777:b1e53580146d
  https://www.illumos.org/issues/3090
  https://www.illumos.org/issues/3102

Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #939

cmd/ztest/ztest.c
include/sys/fs/zfs.h
include/sys/spa_impl.h
include/sys/vdev.h
lib/libzfs/libzfs_import.c
module/zfs/spa.c
module/zfs/spa_misc.c
module/zfs/vdev.c
module/zfs/vdev_label.c

index 0dace49..9b7adc7 100644 (file)
@@ -365,7 +365,7 @@ ztest_info_t ztest_info[] = {
        { ztest_spa_rename,                     1,      &zopt_rarely    },
        { ztest_scrub,                          1,      &zopt_rarely    },
        { ztest_dsl_dataset_promote_busy,       1,      &zopt_rarely    },
-       { ztest_vdev_attach_detach,             1,      &zopt_rarely },
+       { ztest_vdev_attach_detach,             1,      &zopt_rarely    },
        { ztest_vdev_LUN_growth,                1,      &zopt_rarely    },
        { ztest_vdev_add_remove,                1,
            &ztest_opts.zo_vdevtime                             },
@@ -416,6 +416,13 @@ static spa_t *ztest_spa = NULL;
 static ztest_ds_t *ztest_ds;
 
 static kmutex_t ztest_vdev_lock;
+
+/*
+ * The ztest_name_lock protects the pool and dataset namespace used by
+ * the individual tests. To modify the namespace, consumers must grab
+ * this lock as writer. Grabbing the lock as reader will ensure that the
+ * namespace does not change while the lock is held.
+ */
 static krwlock_t ztest_name_lock;
 
 static boolean_t ztest_dump_core = B_TRUE;
@@ -5034,10 +5041,16 @@ ztest_reguid(ztest_ds_t *zd, uint64_t id)
 {
        spa_t *spa = ztest_spa;
        uint64_t orig, load;
+       int error;
 
        orig = spa_guid(spa);
        load = spa_load_guid(spa);
-       if (spa_change_guid(spa) != 0)
+
+       (void) rw_enter(&ztest_name_lock, RW_WRITER);
+       error = spa_change_guid(spa);
+       (void) rw_exit(&ztest_name_lock);
+
+       if (error != 0)
                return;
 
        if (ztest_opts.zo_verbose >= 3) {
@@ -5732,6 +5745,12 @@ ztest_freeze(void)
        VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
        VERIFY3U(0, ==, ztest_dataset_open(0));
        ztest_dataset_close(0);
+
+       spa->spa_debug = B_TRUE;
+       ztest_spa = spa;
+       txg_wait_synced(spa_get_dsl(spa), 0);
+       ztest_reguid(NULL, 0);
+
        spa_close(spa, FTAG);
        kernel_fini();
 }
index 61596f7..800eb77 100644 (file)
@@ -954,6 +954,7 @@ typedef enum history_internal_events {
        LOG_DS_USER_HOLD,
        LOG_DS_USER_RELEASE,
        LOG_POOL_SPLIT,
+       LOG_POOL_GUID_CHANGE,
        LOG_END
 } history_internal_events_t;
 
index 85a825d..65edc97 100644 (file)
@@ -141,6 +141,7 @@ struct spa {
        vdev_t          *spa_root_vdev;         /* top-level vdev container */
        uint64_t        spa_config_guid;        /* config pool guid */
        uint64_t        spa_load_guid;          /* spa_load initialized guid */
+       uint64_t        spa_last_synced_guid;   /* last synced guid */
        list_t          spa_config_dirty_list;  /* vdevs with dirty config */
        list_t          spa_state_dirty_list;   /* vdevs with dirty state */
        spa_aux_vdev_t  spa_spares;             /* hot spares */
index 51eb855..8f297a9 100644 (file)
@@ -141,7 +141,7 @@ extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
 struct uberblock;
 extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
 extern int vdev_label_number(uint64_t psise, uint64_t offset);
-extern nvlist_t *vdev_label_read_config(vdev_t *vd, int label);
+extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg);
 extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **);
 
 typedef enum {
index ad343e8..22e46b4 100644 (file)
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
@@ -434,8 +434,8 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
        uint_t i, nspares, nl2cache;
        boolean_t config_seen;
        uint64_t best_txg;
-       char *name, *hostname, *comment;
-       uint64_t version, guid;
+       char *name, *hostname = NULL;
+       uint64_t guid;
        uint_t children = 0;
        nvlist_t **child = NULL;
        uint_t holes;
@@ -521,61 +521,54 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
                                 * configuration:
                                 *
                                 *      version
-                                *      pool guid
-                                *      name
+                                *      pool guid
+                                *      name
+                                *      pool txg (if available)
                                 *      comment (if available)
-                                *      pool state
+                                *      pool state
                                 *      hostid (if available)
                                 *      hostname (if available)
                                 */
-                               uint64_t state;
+                               uint64_t state, version, pool_txg;
+                               char *comment = NULL;
+
+                               version = fnvlist_lookup_uint64(tmp,
+                                   ZPOOL_CONFIG_VERSION);
+                               fnvlist_add_uint64(config,
+                                   ZPOOL_CONFIG_VERSION, version);
+                               guid = fnvlist_lookup_uint64(tmp,
+                                   ZPOOL_CONFIG_POOL_GUID);
+                               fnvlist_add_uint64(config,
+                                   ZPOOL_CONFIG_POOL_GUID, guid);
+                               name = fnvlist_lookup_string(tmp,
+                                   ZPOOL_CONFIG_POOL_NAME);
+                               fnvlist_add_string(config,
+                                   ZPOOL_CONFIG_POOL_NAME, name);
 
-                               verify(nvlist_lookup_uint64(tmp,
-                                   ZPOOL_CONFIG_VERSION, &version) == 0);
-                               if (nvlist_add_uint64(config,
-                                   ZPOOL_CONFIG_VERSION, version) != 0)
-                                       goto nomem;
-                               verify(nvlist_lookup_uint64(tmp,
-                                   ZPOOL_CONFIG_POOL_GUID, &guid) == 0);
-                               if (nvlist_add_uint64(config,
-                                   ZPOOL_CONFIG_POOL_GUID, guid) != 0)
-                                       goto nomem;
-                               verify(nvlist_lookup_string(tmp,
-                                   ZPOOL_CONFIG_POOL_NAME, &name) == 0);
-                               if (nvlist_add_string(config,
-                                   ZPOOL_CONFIG_POOL_NAME, name) != 0)
-                                       goto nomem;
+                               if (nvlist_lookup_uint64(tmp,
+                                   ZPOOL_CONFIG_POOL_TXG, &pool_txg) == 0)
+                                       fnvlist_add_uint64(config,
+                                           ZPOOL_CONFIG_POOL_TXG, pool_txg);
 
-                               /*
-                                * COMMENT is optional, don't bail if it's not
-                                * there, instead, set it to NULL.
-                                */
                                if (nvlist_lookup_string(tmp,
-                                   ZPOOL_CONFIG_COMMENT, &comment) != 0)
-                                       comment = NULL;
-                               else if (nvlist_add_string(config,
-                                   ZPOOL_CONFIG_COMMENT, comment) != 0)
-                                       goto nomem;
+                                   ZPOOL_CONFIG_COMMENT, &comment) == 0)
+                                       fnvlist_add_string(config,
+                                           ZPOOL_CONFIG_COMMENT, comment);
 
-                               verify(nvlist_lookup_uint64(tmp,
-                                   ZPOOL_CONFIG_POOL_STATE, &state) == 0);
-                               if (nvlist_add_uint64(config,
-                                   ZPOOL_CONFIG_POOL_STATE, state) != 0)
-                                       goto nomem;
+                               state = fnvlist_lookup_uint64(tmp,
+                                   ZPOOL_CONFIG_POOL_STATE);
+                               fnvlist_add_uint64(config,
+                                   ZPOOL_CONFIG_POOL_STATE, state);
 
                                hostid = 0;
                                if (nvlist_lookup_uint64(tmp,
                                    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
-                                       if (nvlist_add_uint64(config,
-                                           ZPOOL_CONFIG_HOSTID, hostid) != 0)
-                                               goto nomem;
-                                       verify(nvlist_lookup_string(tmp,
-                                           ZPOOL_CONFIG_HOSTNAME,
-                                           &hostname) == 0);
-                                       if (nvlist_add_string(config,
-                                           ZPOOL_CONFIG_HOSTNAME,
-                                           hostname) != 0)
-                                               goto nomem;
+                                       fnvlist_add_uint64(config,
+                                           ZPOOL_CONFIG_HOSTID, hostid);
+                                       hostname = fnvlist_lookup_string(tmp,
+                                           ZPOOL_CONFIG_HOSTNAME);
+                                       fnvlist_add_string(config,
+                                           ZPOOL_CONFIG_HOSTNAME, hostname);
                                }
 
                                config_seen = B_TRUE;
index 244f10d..59c43f4 100644 (file)
@@ -117,6 +117,8 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 
 static dsl_syncfunc_t spa_sync_version;
 static dsl_syncfunc_t spa_sync_props;
+static dsl_checkfunc_t spa_change_guid_check;
+static dsl_syncfunc_t spa_change_guid_sync;
 static boolean_t spa_has_active_shared_spare(spa_t *spa);
 static inline int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
@@ -676,6 +678,47 @@ spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
        }
 }
 
+/*ARGSUSED*/
+static int
+spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+       spa_t *spa = arg1;
+       vdev_t *rvd = spa->spa_root_vdev;
+       uint64_t vdev_state;
+       ASSERTV(uint64_t *newguid = arg2);
+
+       spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+       vdev_state = rvd->vdev_state;
+       spa_config_exit(spa, SCL_STATE, FTAG);
+
+       if (vdev_state != VDEV_STATE_HEALTHY)
+               return (ENXIO);
+
+       ASSERT3U(spa_guid(spa), !=, *newguid);
+
+       return (0);
+}
+
+static void
+spa_change_guid_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+       spa_t *spa = arg1;
+       uint64_t *newguid = arg2;
+       uint64_t oldguid;
+       vdev_t *rvd = spa->spa_root_vdev;
+
+       oldguid = spa_guid(spa);
+
+       spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+       rvd->vdev_guid = *newguid;
+       rvd->vdev_guid_sum += (*newguid - oldguid);
+       vdev_config_dirty(rvd);
+       spa_config_exit(spa, SCL_STATE, FTAG);
+
+       spa_history_log_internal(LOG_POOL_GUID_CHANGE, spa, tx,
+           "old=%lld new=%lld", oldguid, *newguid);
+}
+
 /*
  * Change the GUID for the pool.  This is done so that we can later
  * re-import a pool built from a clone of our own vdevs.  We will modify
@@ -688,29 +731,23 @@ spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
 int
 spa_change_guid(spa_t *spa)
 {
-       uint64_t        oldguid, newguid;
-       uint64_t        txg;
-
-       if (!(spa_mode_global & FWRITE))
-               return (EROFS);
-
-       txg = spa_vdev_enter(spa);
-
-       if (spa->spa_root_vdev->vdev_state != VDEV_STATE_HEALTHY)
-               return (spa_vdev_exit(spa, NULL, txg, ENXIO));
+       int error;
+       uint64_t guid;
 
-       oldguid = spa_guid(spa);
-       newguid = spa_generate_guid(NULL);
-       ASSERT3U(oldguid, !=, newguid);
+       mutex_enter(&spa_namespace_lock);
+       guid = spa_generate_guid(NULL);
 
-       spa->spa_root_vdev->vdev_guid = newguid;
-       spa->spa_root_vdev->vdev_guid_sum += (newguid - oldguid);
+       error = dsl_sync_task_do(spa_get_dsl(spa), spa_change_guid_check,
+           spa_change_guid_sync, spa, &guid, 5);
 
-       vdev_config_dirty(spa->spa_root_vdev);
+       if (error == 0) {
+               spa_config_sync(spa, B_FALSE, B_TRUE);
+               spa_event_notify(spa, NULL, FM_EREPORT_ZFS_POOL_REGUID);
+       }
 
-       spa_event_notify(spa, NULL, FM_EREPORT_ZFS_POOL_REGUID);
+       mutex_exit(&spa_namespace_lock);
 
-       return (spa_vdev_exit(spa, NULL, txg, 0));
+       return (error);
 }
 
 /*
@@ -6083,6 +6120,9 @@ spa_sync(spa_t *spa, uint64_t txg)
                                    rvd->vdev_children, txg, B_TRUE);
                }
 
+               if (error == 0)
+                       spa->spa_last_synced_guid = rvd->vdev_guid;
+
                spa_config_exit(spa, SCL_STATE, FTAG);
 
                if (error == 0)
index 440a6ad..5ec8e68 100644 (file)
@@ -1334,16 +1334,29 @@ spa_name(spa_t *spa)
 uint64_t
 spa_guid(spa_t *spa)
 {
+       dsl_pool_t *dp = spa_get_dsl(spa);
+       uint64_t guid;
+
        /*
         * If we fail to parse the config during spa_load(), we can go through
         * the error path (which posts an ereport) and end up here with no root
         * vdev.  We stash the original pool guid in 'spa_config_guid' to handle
         * this case.
         */
-       if (spa->spa_root_vdev != NULL)
+       if (spa->spa_root_vdev == NULL)
+               return (spa->spa_config_guid);
+
+       guid = spa->spa_last_synced_guid != 0 ?
+           spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid;
+
+       /*
+        * Return the most recently synced out guid unless we're
+        * in syncing context.
+        */
+       if (dp && dsl_pool_sync_context(dp))
                return (spa->spa_root_vdev->vdev_guid);
        else
-               return (spa->spa_config_guid);
+               return (guid);
 }
 
 uint64_t
index 9335ba5..b969751 100644 (file)
@@ -1348,9 +1348,9 @@ vdev_validate(vdev_t *vd, boolean_t strict)
        if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
                uint64_t aux_guid = 0;
                nvlist_t *nvl;
+               uint64_t txg = strict ? spa->spa_config_txg : -1ULL;
 
-               if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) ==
-                   NULL) {
+               if ((label = vdev_label_read_config(vd, txg)) == NULL) {
                        vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
                            VDEV_AUX_BAD_LABEL);
                        return (0);
@@ -1533,7 +1533,7 @@ vdev_reopen(vdev_t *vd)
                    !l2arc_vdev_present(vd))
                        l2arc_add_vdev(spa, vd);
        } else {
-               (void) vdev_validate(vd, B_TRUE);
+               (void) vdev_validate(vd, spa_last_synced_txg(spa));
        }
 
        /*
@@ -1994,7 +1994,7 @@ vdev_validate_aux(vdev_t *vd)
        if (!vdev_readable(vd))
                return (0);
 
-       if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == NULL) {
+       if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
                vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_CORRUPT_DATA);
                return (-1);
index 352b630..1fe36fe 100644 (file)
@@ -433,17 +433,22 @@ vdev_top_config_generate(spa_t *spa, nvlist_t *config)
 }
 
 /*
- * Returns the configuration from the label of the given vdev. If 'label' is
- * VDEV_BEST_LABEL, each label of the vdev will be read until a valid
- * configuration is found; otherwise, only the specified label will be read.
+ * Returns the configuration from the label of the given vdev. For vdevs
+ * which don't have a txg value stored on their label (i.e. spares/cache)
+ * or have not been completely initialized (txg = 0) just return
+ * the configuration from the first valid label we find. Otherwise,
+ * find the most up-to-date label that does not exceed the specified
+ * 'txg' value.
  */
 nvlist_t *
-vdev_label_read_config(vdev_t *vd, int label)
+vdev_label_read_config(vdev_t *vd, uint64_t txg)
 {
        spa_t *spa = vd->vdev_spa;
        nvlist_t *config = NULL;
        vdev_phys_t *vp;
        zio_t *zio;
+       uint64_t best_txg = 0;
+       int error = 0;
        int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
            ZIO_FLAG_SPECULATIVE;
        int l;
@@ -457,8 +462,7 @@ vdev_label_read_config(vdev_t *vd, int label)
 
 retry:
        for (l = 0; l < VDEV_LABELS; l++) {
-               if (label >= 0 && label < VDEV_LABELS && label != l)
-                       continue;
+               nvlist_t *label = NULL;
 
                zio = zio_root(spa, NULL, NULL, flags);
 
@@ -468,12 +472,31 @@ retry:
 
                if (zio_wait(zio) == 0 &&
                    nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
-                   &config, 0) == 0)
-                       break;
+                   &label, 0) == 0) {
+                       uint64_t label_txg = 0;
+
+                       /*
+                        * Auxiliary vdevs won't have txg values in their
+                        * labels and newly added vdevs may not have been
+                        * completely initialized so just return the
+                        * configuration from the first valid label we
+                        * encounter.
+                        */
+                       error = nvlist_lookup_uint64(label,
+                           ZPOOL_CONFIG_POOL_TXG, &label_txg);
+                       if ((error || label_txg == 0) && !config) {
+                               config = label;
+                               break;
+                       } else if (label_txg <= txg && label_txg > best_txg) {
+                               best_txg = label_txg;
+                               nvlist_free(config);
+                               config = fnvlist_dup(label);
+                       }
+               }
 
-               if (config != NULL) {
-                       nvlist_free(config);
-                       config = NULL;
+               if (label != NULL) {
+                       nvlist_free(label);
+                       label = NULL;
                }
        }
 
@@ -508,7 +531,7 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
        /*
         * Read the label, if any, and perform some basic sanity checks.
         */
-       if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == NULL)
+       if ((label = vdev_label_read_config(vd, -1ULL)) == NULL)
                return (B_FALSE);
 
        (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
@@ -872,7 +895,6 @@ vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
 struct ubl_cbdata {
        uberblock_t     *ubl_ubbest;    /* Best uberblock */
        vdev_t          *ubl_vd;        /* vdev associated with the above */
-       int             ubl_label;      /* Label associated with the above */
 };
 
 static void
@@ -891,15 +913,13 @@ vdev_uberblock_load_done(zio_t *zio)
                if (ub->ub_txg <= spa->spa_load_max_txg &&
                    vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) {
                        /*
-                        * Keep track of the vdev and label in which this
-                        * uberblock was found. We will use this information
-                        * later to obtain the config nvlist associated with
+                        * Keep track of the vdev in which this uberblock
+                        * was found. We will use this information later
+                        * to obtain the config nvlist associated with
                         * this uberblock.
                         */
                        *cbp->ubl_ubbest = *ub;
                        cbp->ubl_vd = vd;
-                       cbp->ubl_label = vdev_label_number(vd->vdev_psize,
-                           zio->io_offset);
                }
                mutex_exit(&rio->io_lock);
        }
@@ -933,12 +953,11 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
  * Reads the 'best' uberblock from disk along with its associated
  * configuration. First, we read the uberblock array of each label of each
  * vdev, keeping track of the uberblock with the highest txg in each array.
- * Then, we read the configuration from the same label as the best uberblock.
+ * Then, we read the configuration from the same vdev as the best uberblock.
  */
 void
 vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
 {
-       int i;
        zio_t *zio;
        spa_t *spa = rvd->vdev_spa;
        struct ubl_cbdata cb;
@@ -958,13 +977,15 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
        zio = zio_root(spa, NULL, &cb, flags);
        vdev_uberblock_load_impl(zio, rvd, flags, &cb);
        (void) zio_wait(zio);
-       if (cb.ubl_vd != NULL) {
-               for (i = cb.ubl_label % 2; i < VDEV_LABELS; i += 2) {
-                       *config = vdev_label_read_config(cb.ubl_vd, i);
-                       if (*config != NULL)
-                               break;
-               }
-       }
+
+       /*
+        * It's possible that the best uberblock was discovered on a label
+        * that has a configuration which was written in a future txg.
+        * Search all labels on this vdev to find the configuration that
+        * matches the txg for our uberblock.
+        */
+       if (cb.ubl_vd != NULL)
+               *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg);
        spa_config_exit(spa, SCL_ALL, FTAG);
 }