Update core ZFS code from build 121 to build 141.
[zfs.git] / module / zfs / vdev_label.c
index bf93046..75ec545 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
 #include <sys/uberblock_impl.h>
 #include <sys/metaslab.h>
 #include <sys/zio.h>
+#include <sys/dsl_scan.h>
 #include <sys/fs/zfs.h>
 
 /*
@@ -208,7 +208,7 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
  */
 nvlist_t *
 vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
-    boolean_t isspare, boolean_t isl2cache)
+    vdev_config_flag_t flags)
 {
        nvlist_t *nv = NULL;
 
@@ -216,7 +216,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
 
        VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
            vd->vdev_ops->vdev_op_type) == 0);
-       if (!isspare && !isl2cache)
+       if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
                VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id)
                    == 0);
        VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0);
@@ -233,6 +233,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
                VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
                    vd->vdev_physpath) == 0);
 
+       if (vd->vdev_fru != NULL)
+               VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_FRU,
+                   vd->vdev_fru) == 0);
+
        if (vd->vdev_nparity != 0) {
                ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
                    VDEV_TYPE_RAIDZ) == 0);
@@ -242,8 +246,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
                 * into a crufty old storage pool.
                 */
                ASSERT(vd->vdev_nparity == 1 ||
-                   (vd->vdev_nparity == 2 &&
-                   spa_version(spa) >= SPA_VERSION_RAID6));
+                   (vd->vdev_nparity <= 2 &&
+                   spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
+                   (vd->vdev_nparity <= 3 &&
+                   spa_version(spa) >= SPA_VERSION_RAIDZ3));
 
                /*
                 * Note that we'll add the nparity tag even on storage pools
@@ -264,7 +270,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
        if (vd->vdev_isspare)
                VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0);
 
-       if (!isspare && !isl2cache && vd == vd->vdev_top) {
+       if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
+           vd == vd->vdev_top) {
                VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
                    vd->vdev_ms_array) == 0);
                VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
@@ -275,39 +282,74 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
                    vd->vdev_asize) == 0);
                VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG,
                    vd->vdev_islog) == 0);
+               if (vd->vdev_removing)
+                       VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
+                           vd->vdev_removing) == 0);
        }
 
-       if (vd->vdev_dtl.smo_object != 0)
+       if (vd->vdev_dtl_smo.smo_object != 0)
                VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
-                   vd->vdev_dtl.smo_object) == 0);
+                   vd->vdev_dtl_smo.smo_object) == 0);
+
+       if (vd->vdev_crtxg)
+               VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
+                   vd->vdev_crtxg) == 0);
 
        if (getstats) {
                vdev_stat_t vs;
+               pool_scan_stat_t ps;
+
                vdev_get_stats(vd, &vs);
-               VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS,
+               VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
                    (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0);
+
+               /* provide either current or previous scan information */
+               if (spa_scan_get_stats(spa, &ps) == 0) {
+                       VERIFY(nvlist_add_uint64_array(nv,
+                           ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
+                           sizeof (pool_scan_stat_t) / sizeof (uint64_t))
+                           == 0);
+               }
        }
 
        if (!vd->vdev_ops->vdev_op_leaf) {
                nvlist_t **child;
-               int c;
+               int c, idx;
+
+               ASSERT(!vd->vdev_ishole);
 
                child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
                    KM_SLEEP);
 
-               for (c = 0; c < vd->vdev_children; c++)
-                       child[c] = vdev_config_generate(spa, vd->vdev_child[c],
-                           getstats, isspare, isl2cache);
+               for (c = 0, idx = 0; c < vd->vdev_children; c++) {
+                       vdev_t *cvd = vd->vdev_child[c];
+
+                       /*
+                        * If we're generating an nvlist of removing
+                        * vdevs then skip over any device which is
+                        * not being removed.
+                        */
+                       if ((flags & VDEV_CONFIG_REMOVING) &&
+                           !cvd->vdev_removing)
+                               continue;
 
-               VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
-                   child, vd->vdev_children) == 0);
+                       child[idx++] = vdev_config_generate(spa, cvd,
+                           getstats, flags);
+               }
+
+               if (idx) {
+                       VERIFY(nvlist_add_nvlist_array(nv,
+                           ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
+               }
 
-               for (c = 0; c < vd->vdev_children; c++)
+               for (c = 0; c < idx; c++)
                        nvlist_free(child[c]);
 
                kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
 
        } else {
+               const char *aux = NULL;
+
                if (vd->vdev_offline && !vd->vdev_tmpoffline)
                        VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE,
                            B_TRUE) == 0);
@@ -323,11 +365,66 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
                if (vd->vdev_unspare)
                        VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE,
                            B_TRUE) == 0);
+               if (vd->vdev_ishole)
+                       VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE,
+                           B_TRUE) == 0);
+
+               switch (vd->vdev_stat.vs_aux) {
+               case VDEV_AUX_ERR_EXCEEDED:
+                       aux = "err_exceeded";
+                       break;
+
+               case VDEV_AUX_EXTERNAL:
+                       aux = "external";
+                       break;
+               }
+
+               if (aux != NULL)
+                       VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE,
+                           aux) == 0);
+
+               if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) {
+                       VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
+                           vd->vdev_orig_guid) == 0);
+               }
        }
 
        return (nv);
 }
 
+/*
+ * Describe the top-level vdev layout in the given config nvlist.  If any
+ * child of the root vdev is a hole, add ZPOOL_CONFIG_HOLE_ARRAY holding the
+ * child indices of those holes.  ZPOOL_CONFIG_VDEV_CHILDREN, the root vdev's
+ * child count (holes included), is always added.
+ */
+void
+vdev_top_config_generate(spa_t *spa, nvlist_t *config)
+{
+       vdev_t *rvd = spa->spa_root_vdev;
+       uint64_t *array;
+       uint_t c, idx;
+
+       array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP);
+
+       for (c = 0, idx = 0; c < rvd->vdev_children; c++) {
+               vdev_t *tvd = rvd->vdev_child[c];
+
+               if (tvd->vdev_ishole)
+                       array[idx++] = c;       /* record the child's index */
+       }
+
+       if (idx) {      /* the array is omitted when there are no holes */
+               VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
+                   array, idx) == 0);
+       }
+
+       VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
+           rvd->vdev_children) == 0);
+
+       kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
+}
+
 nvlist_t *
 vdev_label_read_config(vdev_t *vd)
 {
@@ -335,8 +432,8 @@ vdev_label_read_config(vdev_t *vd)
        nvlist_t *config = NULL;
        vdev_phys_t *vp;
        zio_t *zio;
-       int flags =
-           ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
+       int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+           ZIO_FLAG_SPECULATIVE;
 
        ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
@@ -345,6 +442,7 @@ vdev_label_read_config(vdev_t *vd)
 
        vp = zio_buf_alloc(sizeof (vdev_phys_t));
 
+retry:
        for (int l = 0; l < VDEV_LABELS; l++) {
 
                zio = zio_root(spa, NULL, NULL, flags);
@@ -364,6 +462,11 @@ vdev_label_read_config(vdev_t *vd)
                }
        }
 
+       if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) {
+               flags |= ZIO_FLAG_TRYHARD;
+               goto retry;
+       }
+
        zio_buf_free(vp, sizeof (vdev_phys_t));
 
        return (config);
@@ -488,7 +591,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
        spa_t *spa = vd->vdev_spa;
        nvlist_t *label;
        vdev_phys_t *vp;
-       vdev_boot_header_t *vb;
+       char *pad2;
        uberblock_t *ub;
        zio_t *zio;
        char *buf;
@@ -504,6 +607,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
                    crtxg, reason)) != 0)
                        return (error);
 
+       /* Track the creation time for this vdev */
+       vd->vdev_crtxg = crtxg;
+
        if (!vd->vdev_ops->vdev_op_leaf)
                return (0);
 
@@ -516,13 +622,10 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
        /*
         * Determine if the vdev is in use.
         */
-       if (reason != VDEV_LABEL_REMOVE &&
+       if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT &&
            vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid))
                return (EBUSY);
 
-       ASSERT(reason != VDEV_LABEL_REMOVE ||
-           vdev_inuse(vd, crtxg, reason, NULL, NULL));
-
        /*
         * If this is a request to add or replace a spare or l2cache device
         * that is in use elsewhere on the system, then we must update the
@@ -545,7 +648,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
                 */
                if (reason == VDEV_LABEL_SPARE)
                        return (0);
-               ASSERT(reason == VDEV_LABEL_REPLACE);
+               ASSERT(reason == VDEV_LABEL_REPLACE ||
+                   reason == VDEV_LABEL_SPLIT);
        }
 
        if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE &&
@@ -610,7 +714,11 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
                VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
                    vd->vdev_guid) == 0);
        } else {
-               label = spa_config_generate(spa, vd, 0ULL, B_FALSE);
+               uint64_t txg = 0ULL;
+
+               if (reason == VDEV_LABEL_SPLIT)
+                       txg = spa->spa_uberblock.ub_txg;
+               label = spa_config_generate(spa, vd, txg, B_FALSE);
 
                /*
                 * Add our creation time.  This allows us to detect multiple
@@ -633,26 +741,21 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
        }
 
        /*
-        * Initialize boot block header.
-        */
-       vb = zio_buf_alloc(sizeof (vdev_boot_header_t));
-       bzero(vb, sizeof (vdev_boot_header_t));
-       vb->vb_magic = VDEV_BOOT_MAGIC;
-       vb->vb_version = VDEV_BOOT_VERSION;
-       vb->vb_offset = VDEV_BOOT_OFFSET;
-       vb->vb_size = VDEV_BOOT_SIZE;
-
-       /*
         * Initialize uberblock template.
         */
-       ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
-       bzero(ub, VDEV_UBERBLOCK_SIZE(vd));
+       ub = zio_buf_alloc(VDEV_UBERBLOCK_RING);
+       bzero(ub, VDEV_UBERBLOCK_RING);
        *ub = spa->spa_uberblock;
        ub->ub_txg = 0;
 
+       /* Initialize the 2nd padding area. */
+       pad2 = zio_buf_alloc(VDEV_PAD_SIZE);
+       bzero(pad2, VDEV_PAD_SIZE);
+
        /*
         * Write everything in parallel.
         */
+retry:
        zio = zio_root(spa, NULL, NULL, flags);
 
        for (int l = 0; l < VDEV_LABELS; l++) {
@@ -661,22 +764,30 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
                    offsetof(vdev_label_t, vl_vdev_phys),
                    sizeof (vdev_phys_t), NULL, NULL, flags);
 
-               vdev_label_write(zio, vd, l, vb,
-                   offsetof(vdev_label_t, vl_boot_header),
-                   sizeof (vdev_boot_header_t), NULL, NULL, flags);
+               /*
+                * Skip the 1st padding area.
+                * Zero out the 2nd padding area, since it might contain
+                * leftover data from a previous filesystem format.
+                */
+               vdev_label_write(zio, vd, l, pad2,
+                   offsetof(vdev_label_t, vl_pad2),
+                   VDEV_PAD_SIZE, NULL, NULL, flags);
 
-               for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
-                       vdev_label_write(zio, vd, l, ub,
-                           VDEV_UBERBLOCK_OFFSET(vd, n),
-                           VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags);
-               }
+               vdev_label_write(zio, vd, l, ub,
+                   offsetof(vdev_label_t, vl_uberblock),
+                   VDEV_UBERBLOCK_RING, NULL, NULL, flags);
        }
 
        error = zio_wait(zio);
 
+       if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
+               flags |= ZIO_FLAG_TRYHARD;
+               goto retry;
+       }
+
        nvlist_free(label);
-       zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd));
-       zio_buf_free(vb, sizeof (vdev_boot_header_t));
+       zio_buf_free(pad2, VDEV_PAD_SIZE);
+       zio_buf_free(ub, VDEV_UBERBLOCK_RING);
        zio_buf_free(vp, sizeof (vdev_phys_t));
 
        /*
@@ -733,6 +844,7 @@ vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
 static void
 vdev_uberblock_load_done(zio_t *zio)
 {
+       spa_t *spa = zio->io_spa;
        zio_t *rio = zio->io_private;
        uberblock_t *ub = zio->io_data;
        uberblock_t *ubbest = rio->io_private;
@@ -741,7 +853,8 @@ vdev_uberblock_load_done(zio_t *zio)
 
        if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
                mutex_enter(&rio->io_lock);
-               if (vdev_uberblock_compare(ub, ubbest) > 0)
+               if (ub->ub_txg <= spa->spa_load_max_txg &&
+                   vdev_uberblock_compare(ub, ubbest) > 0)
                        *ubbest = *ub;
                mutex_exit(&rio->io_lock);
        }
@@ -754,8 +867,8 @@ vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest)
 {
        spa_t *spa = vd->vdev_spa;
        vdev_t *rvd = spa->spa_root_vdev;
-       int flags =
-           ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
+       int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+           ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
 
        if (vd == rvd) {
                ASSERT(zio == NULL);
@@ -958,7 +1071,10 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
        for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) {
                uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t),
                    KM_SLEEP);
-               zio_t *vio = zio_null(zio, spa,
+
+               ASSERT(!vd->vdev_ishole);
+
+               zio_t *vio = zio_null(zio, spa, NULL,
                    (vd->vdev_islog || vd->vdev_aux != NULL) ?
                    vdev_label_sync_ignore_done : vdev_label_sync_top_done,
                    good_writes, flags);
@@ -993,7 +1109,7 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
  * at any time, you can just call it again, and it will resume its work.
  */
 int
-vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
+vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard)
 {
        spa_t *spa = svd[0]->vdev_spa;
        uberblock_t *ub = &spa->spa_uberblock;
@@ -1002,6 +1118,16 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
        int error;
        int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
 
+       /*
+        * Normally, we don't want to try too hard to write every label and
+        * uberblock.  If there is a flaky disk, we don't want the rest of the
+        * sync process to block while we retry.  But if we can't write a
+        * single label out, we should retry with ZIO_FLAG_TRYHARD before
+        * bailing out and declaring the pool faulted.
+        */
+       if (tryhard)
+               flags |= ZIO_FLAG_TRYHARD;
+
        ASSERT(ub->ub_txg <= txg);
 
        /*