X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fvdev_label.c;h=1fe36feb2491ac3e41be9445ec135b62560f2b57;hb=36f86f73f68548f46eb3229c8adf583d59fa9988;hp=c08ed8ba0467eb74e48da5118bf488955ba69e46;hpb=572e285762521df27fe5b026f409ba1a21abb7ac;p=zfs.git diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index c08ed8b..1fe36fe 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -18,8 +18,10 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -121,6 +123,8 @@ * txg Transaction group in which this label was written * pool_guid Unique identifier for this pool * vdev_tree An nvlist describing vdev tree. + * features_for_read + * An nvlist of the features necessary for reading the MOS. * * Each leaf device label also contains the following: * @@ -212,7 +216,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, { nvlist_t *nv = NULL; - VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type) == 0); @@ -319,7 +323,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, ASSERT(!vd->vdev_ishole); child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), - KM_SLEEP); + KM_PUSHPAGE); for (c = 0, idx = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; @@ -408,7 +412,7 @@ vdev_top_config_generate(spa_t *spa, nvlist_t *config) uint64_t *array; uint_t c, idx; - array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP); + array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_PUSHPAGE); for (c = 0, idx = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; @@ -428,15 +432,26 @@ vdev_top_config_generate(spa_t *spa, nvlist_t *config) kmem_free(array, rvd->vdev_children * sizeof (uint64_t)); } +/* + * Returns the configuration from the label of the given vdev. For vdevs + * which don't have a txg value stored on their label (i.e. spares/cache) + * or have not been completely initialized (txg = 0) just return + * the configuration from the first valid label we find. Otherwise, + * find the most up-to-date label that does not exceed the specified + * 'txg' value. + */ nvlist_t * -vdev_label_read_config(vdev_t *vd) +vdev_label_read_config(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; nvlist_t *config = NULL; vdev_phys_t *vp; zio_t *zio; + uint64_t best_txg = 0; + int error = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; + int l; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); @@ -446,7 +461,8 @@ vdev_label_read_config(vdev_t *vd) vp = zio_buf_alloc(sizeof (vdev_phys_t)); retry: - for (int l = 0; l < VDEV_LABELS; l++) { + for (l = 0; l < VDEV_LABELS; l++) { + nvlist_t *label = NULL; zio = zio_root(spa, NULL, NULL, flags); @@ -456,12 +472,31 @@ retry: if (zio_wait(zio) == 0 && nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist), - &config, 0) == 0) - break; + &label, 0) == 0) { + uint64_t label_txg = 0; - if (config != NULL) { - nvlist_free(config); - config = NULL; + /* + * Auxiliary vdevs won't have txg values in their + * labels and newly added vdevs may not have been + * completely initialized so just return the + * configuration from the first valid label we + * encounter. + */ + error = nvlist_lookup_uint64(label, + ZPOOL_CONFIG_POOL_TXG, &label_txg); + if ((error || label_txg == 0) && !config) { + config = label; + break; + } else if (label_txg <= txg && label_txg > best_txg) { + best_txg = label_txg; + nvlist_free(config); + config = fnvlist_dup(label); + } + } + + if (label != NULL) { + nvlist_free(label); + label = NULL; } } @@ -496,7 +531,7 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, /* * Read the label, if any, and perform some basic sanity checks. */ - if ((label = vdev_label_read_config(vd)) == NULL) + if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) return (B_FALSE); (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, @@ -564,6 +599,8 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, case VDEV_LABEL_SPARE: return (spa_has_spare(spa, device_guid)); + default: + break; } } @@ -609,12 +646,14 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) char *buf; size_t buflen; int error; - uint64_t spare_guid, l2cache_guid; + uint64_t spare_guid = 0, l2cache_guid = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + int c, l; + vdev_t *pvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - for (int c = 0; c < vd->vdev_children; c++) + for (c = 0; c < vd->vdev_children; c++) if ((error = vdev_label_init(vd->vdev_child[c], crtxg, reason)) != 0) return (error); @@ -650,7 +689,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) vd->vdev_guid += guid_delta; - for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) + for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += guid_delta; /* @@ -670,7 +709,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) vd->vdev_guid += guid_delta; - for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) + for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += guid_delta; /* @@ -704,7 +743,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) * active hot spare (in which case we want to revert the * labels). */ - VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, spa_version(spa)) == 0); @@ -717,7 +756,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * For level 2 ARC devices, add a special label. */ - VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, spa_version(spa)) == 0); @@ -744,7 +783,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) buf = vp->vp_nvlist; buflen = sizeof (vp->vp_nvlist); - error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); + error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_PUSHPAGE); if (error != 0) { nvlist_free(label); zio_buf_free(vp, sizeof (vdev_phys_t)); @@ -770,7 +809,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) retry: zio = zio_root(spa, NULL, NULL, flags); - for (int l = 0; l < VDEV_LABELS; l++) { + for (l = 0; l < VDEV_LABELS; l++) { vdev_label_write(zio, vd, l, vp, offsetof(vdev_label_t, vl_vdev_phys), @@ -833,7 +872,7 @@ retry: * come back up, we fail to see the uberblock for txg + 1 because, say, * it was on a mirrored device and the replica to which we wrote txg + 1 * is now offline. If we then make some changes and sync txg + 1, and then - * the missing replica comes back, then for a new seconds we'll have two + * the missing replica comes back, then for a few seconds we'll have two * conflicting uberblocks on disk with the same txg. The solution is simple: * among uberblocks with equal txg, choose the one with the latest timestamp. */ @@ -853,50 +892,53 @@ vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2) return (0); } +struct ubl_cbdata { + uberblock_t *ubl_ubbest; /* Best uberblock */ + vdev_t *ubl_vd; /* vdev associated with the above */ +}; + static void vdev_uberblock_load_done(zio_t *zio) { + vdev_t *vd = zio->io_vd; spa_t *spa = zio->io_spa; zio_t *rio = zio->io_private; uberblock_t *ub = zio->io_data; - uberblock_t *ubbest = rio->io_private; + struct ubl_cbdata *cbp = rio->io_private; - ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd)); + ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd)); if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&rio->io_lock); if (ub->ub_txg <= spa->spa_load_max_txg && - vdev_uberblock_compare(ub, ubbest) > 0) - *ubbest = *ub; + vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) { + /* + * Keep track of the vdev in which this uberblock + * was found. We will use this information later + * to obtain the config nvlist associated with + * this uberblock. + */ + *cbp->ubl_ubbest = *ub; + cbp->ubl_vd = vd; + } mutex_exit(&rio->io_lock); } zio_buf_free(zio->io_data, zio->io_size); } -void -vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest) +static void +vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, + struct ubl_cbdata *cbp) { - spa_t *spa = vd->vdev_spa; - vdev_t *rvd = spa->spa_root_vdev; - int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | - ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; + int c, l, n; - if (vd == rvd) { - ASSERT(zio == NULL); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - zio = zio_root(spa, NULL, ubbest, flags); - bzero(ubbest, sizeof (uberblock_t)); - } - - ASSERT(zio != NULL); - - for (int c = 0; c < vd->vdev_children; c++) - vdev_uberblock_load(zio, vd->vdev_child[c], ubbest); + for (c = 0; c < vd->vdev_children; c++) + vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { - for (int l = 0; l < VDEV_LABELS; l++) { - for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { + for (l = 0; l < VDEV_LABELS; l++) { + for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_read(zio, vd, l, zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)), VDEV_UBERBLOCK_OFFSET(vd, n), @@ -905,11 +947,46 @@ vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest) } } } +} - if (vd == rvd) { - (void) zio_wait(zio); - spa_config_exit(spa, SCL_ALL, FTAG); - } +/* + * Reads the 'best' uberblock from disk along with its associated + * configuration. First, we read the uberblock array of each label of each + * vdev, keeping track of the uberblock with the highest txg in each array. + * Then, we read the configuration from the same vdev as the best uberblock. + */ +void +vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) +{ + zio_t *zio; + spa_t *spa = rvd->vdev_spa; + struct ubl_cbdata cb; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; + + ASSERT(ub); + ASSERT(config); + + bzero(ub, sizeof (uberblock_t)); + *config = NULL; + + cb.ubl_ubbest = ub; + cb.ubl_vd = NULL; + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + zio = zio_root(spa, NULL, &cb, flags); + vdev_uberblock_load_impl(zio, rvd, flags, &cb); + (void) zio_wait(zio); + + /* + * It's possible that the best uberblock was discovered on a label + * that has a configuration which was written in a future txg. + * Search all labels on this vdev to find the configuration that + * matches the txg for our uberblock. + */ + if (cb.ubl_vd != NULL) + *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg); + spa_config_exit(spa, SCL_ALL, FTAG); } /* @@ -932,9 +1009,9 @@ static void vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) { uberblock_t *ubbuf; - int n; + int c, l, n; - for (int c = 0; c < vd->vdev_children; c++) + for (c = 0; c < vd->vdev_children; c++) vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags); if (!vd->vdev_ops->vdev_op_leaf) @@ -949,7 +1026,7 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) bzero(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); *ubbuf = *ub; - for (int l = 0; l < VDEV_LABELS; l++) + for (l = 0; l < VDEV_LABELS; l++) vdev_label_write(zio, vd, l, ubbuf, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_sync_done, zio->io_private, @@ -964,10 +1041,11 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) spa_t *spa = svd[0]->vdev_spa; zio_t *zio; uint64_t good_writes = 0; + int v; zio = zio_root(spa, NULL, &good_writes, flags); - for (int v = 0; v < svdcount; v++) + for (v = 0; v < svdcount; v++) vdev_uberblock_sync(zio, ub, svd[v], flags); (void) zio_wait(zio); @@ -979,7 +1057,7 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) */ zio = zio_root(spa, NULL, NULL, flags); - for (int v = 0; v < svdcount; v++) + for (v = 0; v < svdcount; v++) zio_flush(zio, svd[v]); (void) zio_wait(zio); @@ -1032,8 +1110,9 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) vdev_phys_t *vp; char *buf; size_t buflen; + int c; - for (int c = 0; c < vd->vdev_children; c++) + for (c = 0; c < vd->vdev_children; c++) vdev_label_sync(zio, vd->vdev_child[c], l, txg, flags); if (!vd->vdev_ops->vdev_op_leaf) @@ -1053,7 +1132,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) buf = vp->vp_nvlist; buflen = sizeof (vp->vp_nvlist); - if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0) { + if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_PUSHPAGE) == 0) { for (; l < VDEV_LABELS; l += 2) { vdev_label_write(zio, vd, l, vp, offsetof(vdev_label_t, vl_vdev_phys), @@ -1081,12 +1160,13 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) zio = zio_root(spa, NULL, NULL, flags); for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) { - uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t), - KM_SLEEP); + uint64_t *good_writes; + zio_t *vio; ASSERT(!vd->vdev_ishole); - zio_t *vio = zio_null(zio, spa, NULL, + good_writes = kmem_zalloc(sizeof (uint64_t), KM_PUSHPAGE); + vio = zio_null(zio, spa, NULL, (vd->vdev_islog || vd->vdev_aux != NULL) ? vdev_label_sync_ignore_done : vdev_label_sync_top_done, good_writes, flags);