* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/zio.h>
+#include <sys/dsl_scan.h>
#include <sys/fs/zfs.h>
nvlist_t *
vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
- boolean_t isspare, boolean_t isl2cache)
+ vdev_config_flag_t flags)
{
	nvlist_t *nv = NULL;

	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);

VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
vd->vdev_ops->vdev_op_type) == 0);
- if (!isspare && !isl2cache)
+ if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id)
== 0);
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0);
	if (vd->vdev_physpath != NULL)
		VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
		    vd->vdev_physpath) == 0);
+ if (vd->vdev_fru != NULL)
+ VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_FRU,
+ vd->vdev_fru) == 0);
+
if (vd->vdev_nparity != 0) {
ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
VDEV_TYPE_RAIDZ) == 0);
	/*
	 * Make sure someone hasn't managed to sneak a fancy new vdev
	 * into a crufty old storage pool.
*/
ASSERT(vd->vdev_nparity == 1 ||
- (vd->vdev_nparity == 2 &&
- spa_version(spa) >= SPA_VERSION_RAID6));
+ (vd->vdev_nparity <= 2 &&
+ spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
+ (vd->vdev_nparity <= 3 &&
+ spa_version(spa) >= SPA_VERSION_RAIDZ3));
/*
	 * Note that we'll add the nparity tag even on storage pools
	 * that only support a single parity device -- older software
	 * will just ignore it.
	 */
	VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY,
	    vd->vdev_nparity) == 0);
	}

if (vd->vdev_isspare)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0);
- if (!isspare && !isl2cache && vd == vd->vdev_top) {
+ if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
+ vd == vd->vdev_top) {
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
vd->vdev_ms_array) == 0);
		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
		    vd->vdev_ms_shift) == 0);
		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
		    vd->vdev_asize) == 0);
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG,
vd->vdev_islog) == 0);
+ if (vd->vdev_removing)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
+ vd->vdev_removing) == 0);
}
if (vd->vdev_dtl_smo.smo_object != 0)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
vd->vdev_dtl_smo.smo_object) == 0);
+ if (vd->vdev_crtxg)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
+ vd->vdev_crtxg) == 0);
+
if (getstats) {
vdev_stat_t vs;
+ pool_scan_stat_t ps;
+
vdev_get_stats(vd, &vs);
- VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS,
+ VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0);
+
+ /* provide either current or previous scan information */
+ if (spa_scan_get_stats(spa, &ps) == 0) {
+ VERIFY(nvlist_add_uint64_array(nv,
+ ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
+ sizeof (pool_scan_stat_t) / sizeof (uint64_t))
+ == 0);
+ }
}
if (!vd->vdev_ops->vdev_op_leaf) {
nvlist_t **child;
- int c;
+ int c, idx;
+
+ ASSERT(!vd->vdev_ishole);
child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
KM_SLEEP);
- for (c = 0; c < vd->vdev_children; c++)
- child[c] = vdev_config_generate(spa, vd->vdev_child[c],
- getstats, isspare, isl2cache);
+ for (c = 0, idx = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ /*
+ * If we're generating an nvlist of removing
+			 * vdevs, skip over any device that is
+ * not being removed.
+ */
+ if ((flags & VDEV_CONFIG_REMOVING) &&
+ !cvd->vdev_removing)
+ continue;
+
+ child[idx++] = vdev_config_generate(spa, cvd,
+ getstats, flags);
+ }
- VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- child, vd->vdev_children) == 0);
+ if (idx) {
+ VERIFY(nvlist_add_nvlist_array(nv,
+ ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
+ }
- for (c = 0; c < vd->vdev_children; c++)
+ for (c = 0; c < idx; c++)
nvlist_free(child[c]);
kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
} else {
+ const char *aux = NULL;
+
if (vd->vdev_offline && !vd->vdev_tmpoffline)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE,
B_TRUE) == 0);
+ if (vd->vdev_resilvering)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVERING,
+ B_TRUE) == 0);
if (vd->vdev_faulted)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED,
B_TRUE) == 0);
if (vd->vdev_unspare)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE,
B_TRUE) == 0);
+ if (vd->vdev_ishole)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE,
+ B_TRUE) == 0);
+
+ switch (vd->vdev_stat.vs_aux) {
+ case VDEV_AUX_ERR_EXCEEDED:
+ aux = "err_exceeded";
+ break;
+
+ case VDEV_AUX_EXTERNAL:
+ aux = "external";
+ break;
+ }
+
+ if (aux != NULL)
+ VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE,
+ aux) == 0);
+
+ if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) {
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
+ vd->vdev_orig_guid) == 0);
+ }
}
return (nv);
}
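
For reference, the vdev_config_flag_t bits used above replace the old
isspare/isl2cache booleans. A minimal sketch of the definition assumed
here (the actual declaration belongs in sys/vdev_impl.h):

/* Sketch: assumed flag definition, normally in sys/vdev_impl.h. */
typedef enum vdev_config_flag {
	VDEV_CONFIG_SPARE = 1 << 0,	/* generating config for a spare */
	VDEV_CONFIG_L2CACHE = 1 << 1,	/* generating config for an L2ARC dev */
	VDEV_CONFIG_REMOVING = 1 << 2	/* include only removing children */
} vdev_config_flag_t;

Callers OR these bits together where they previously passed two separate
booleans.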
+/*
+ * Generate a view of the top-level vdevs. If we currently have holes
+ * in the namespace, then generate an array containing the indices of the
+ * holey vdevs.  Additionally, add the number of top-level children that
+ * currently exist.
+ */
+void
+vdev_top_config_generate(spa_t *spa, nvlist_t *config)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t *array;
+ uint_t c, idx;
+
+ array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP);
+
+ for (c = 0, idx = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ if (tvd->vdev_ishole)
+ array[idx++] = c;
+ }
+
+ if (idx) {
+ VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
+ array, idx) == 0);
+ }
+
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
+ rvd->vdev_children) == 0);
+
+ kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
+}
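
To illustrate the view generated above, a hypothetical consumer (not part
of this patch) could unpack the hole array and child count from the
resulting config nvlist as follows, assuming only libnvpair and the
ZPOOL_CONFIG_* names added by this change:

#include <stdio.h>
#include <libnvpair.h>
#include <sys/fs/zfs.h>

/* Hypothetical consumer: report the holes recorded above. */
static void
report_holes(nvlist_t *config)
{
	uint64_t *hole_array, nchildren;
	uint_t nholes, i;

	/* Indices of missing (hole) top-level vdevs, if any. */
	if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
	    &hole_array, &nholes) == 0) {
		for (i = 0; i < nholes; i++)
			(void) printf("hole at top-level index %llu\n",
			    (u_longlong_t)hole_array[i]);
	}

	/* Total number of top-level children, holes included. */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
	    &nchildren) == 0)
		(void) printf("%llu top-level children\n",
		    (u_longlong_t)nchildren);
}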
+
nvlist_t *
vdev_label_read_config(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	nvlist_t *config = NULL;
vdev_phys_t *vp;
zio_t *zio;
- int flags =
- ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE;
+ int l;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
vp = zio_buf_alloc(sizeof (vdev_phys_t));
- for (int l = 0; l < VDEV_LABELS; l++) {
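+	/*
+	 * Read all of the labels; if no valid config is found on the
+	 * first pass, retry once with ZIO_FLAG_TRYHARD before giving up.
+	 */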
+retry:
+ for (l = 0; l < VDEV_LABELS; l++) {
zio = zio_root(spa, NULL, NULL, flags);
}
}
+ if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) {
+ flags |= ZIO_FLAG_TRYHARD;
+ goto retry;
+ }
+
zio_buf_free(vp, sizeof (vdev_phys_t));
return (config);
case VDEV_LABEL_SPARE:
return (spa_has_spare(spa, device_guid));
+ default:
+ break;
}
}
return (B_TRUE);
/*
+ * We can't rely on a pool's state if it's been imported
+	 * read-only.  Instead we look to see if the pool is marked
+ * read-only in the namespace and set the state to active.
+ */
+ if ((spa = spa_by_guid(pool_guid, device_guid)) != NULL &&
+ spa_mode(spa) == FREAD)
+ state = POOL_STATE_ACTIVE;
+
+ /*
* If the device is marked ACTIVE, then this device is in use by another
* pool on the system.
*/
spa_t *spa = vd->vdev_spa;
nvlist_t *label;
vdev_phys_t *vp;
- vdev_boot_header_t *vb;
+ char *pad2;
uberblock_t *ub;
zio_t *zio;
char *buf;
size_t buflen;
int error;
- uint64_t spare_guid, l2cache_guid;
+ uint64_t spare_guid = 0, l2cache_guid = 0;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+ int c, l;
+ vdev_t *pvd;
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
- for (int c = 0; c < vd->vdev_children; c++)
+ for (c = 0; c < vd->vdev_children; c++)
if ((error = vdev_label_init(vd->vdev_child[c],
crtxg, reason)) != 0)
return (error);
+ /* Track the creation time for this vdev */
+ vd->vdev_crtxg = crtxg;
+
if (!vd->vdev_ops->vdev_op_leaf)
return (0);
/*
* Determine if the vdev is in use.
*/
- if (reason != VDEV_LABEL_REMOVE &&
+ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT &&
vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid))
return (EBUSY);
vd->vdev_guid += guid_delta;
- for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
+ for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
pvd->vdev_guid_sum += guid_delta;
if (reason == VDEV_LABEL_SPARE)
return (0);
- ASSERT(reason == VDEV_LABEL_REPLACE);
+ ASSERT(reason == VDEV_LABEL_REPLACE ||
+ reason == VDEV_LABEL_SPLIT);
}
if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE &&
vd->vdev_guid += guid_delta;
- for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
+ for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
pvd->vdev_guid_sum += guid_delta;
VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
vd->vdev_guid) == 0);
} else {
- label = spa_config_generate(spa, vd, 0ULL, B_FALSE);
+ uint64_t txg = 0ULL;
+
+ if (reason == VDEV_LABEL_SPLIT)
+ txg = spa->spa_uberblock.ub_txg;
+ label = spa_config_generate(spa, vd, txg, B_FALSE);
/*
	 * Add our creation time.  This allows us to detect multiple
	 * vdev uses as described above, and automatically expires if we
	 * fail.
	 */
	VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
	    crtxg) == 0);
	}
/*
- * Initialize boot block header.
- */
- vb = zio_buf_alloc(sizeof (vdev_boot_header_t));
- bzero(vb, sizeof (vdev_boot_header_t));
- vb->vb_magic = VDEV_BOOT_MAGIC;
- vb->vb_version = VDEV_BOOT_VERSION;
- vb->vb_offset = VDEV_BOOT_OFFSET;
- vb->vb_size = VDEV_BOOT_SIZE;
-
- /*
* Initialize uberblock template.
*/
- ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
- bzero(ub, VDEV_UBERBLOCK_SIZE(vd));
+ ub = zio_buf_alloc(VDEV_UBERBLOCK_RING);
+ bzero(ub, VDEV_UBERBLOCK_RING);
*ub = spa->spa_uberblock;
ub->ub_txg = 0;
+ /* Initialize the 2nd padding area. */
+ pad2 = zio_buf_alloc(VDEV_PAD_SIZE);
+ bzero(pad2, VDEV_PAD_SIZE);
+
/*
* Write everything in parallel.
*/
+retry:
zio = zio_root(spa, NULL, NULL, flags);
- for (int l = 0; l < VDEV_LABELS; l++) {
+ for (l = 0; l < VDEV_LABELS; l++) {
vdev_label_write(zio, vd, l, vp,
offsetof(vdev_label_t, vl_vdev_phys),
sizeof (vdev_phys_t), NULL, NULL, flags);
- vdev_label_write(zio, vd, l, vb,
- offsetof(vdev_label_t, vl_boot_header),
- sizeof (vdev_boot_header_t), NULL, NULL, flags);
+ /*
+ * Skip the 1st padding area.
+		 * Zero out the 2nd padding area, which might contain
+		 * leftover data from a previous filesystem format.
+ */
+ vdev_label_write(zio, vd, l, pad2,
+ offsetof(vdev_label_t, vl_pad2),
+ VDEV_PAD_SIZE, NULL, NULL, flags);
- for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
- vdev_label_write(zio, vd, l, ub,
- VDEV_UBERBLOCK_OFFSET(vd, n),
- VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags);
- }
+ vdev_label_write(zio, vd, l, ub,
+ offsetof(vdev_label_t, vl_uberblock),
+ VDEV_UBERBLOCK_RING, NULL, NULL, flags);
}
error = zio_wait(zio);
+ if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
+ flags |= ZIO_FLAG_TRYHARD;
+ goto retry;
+ }
+
nvlist_free(label);
- zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd));
- zio_buf_free(vb, sizeof (vdev_boot_header_t));
+ zio_buf_free(pad2, VDEV_PAD_SIZE);
+ zio_buf_free(ub, VDEV_UBERBLOCK_RING);
zio_buf_free(vp, sizeof (vdev_phys_t));
/*
- * For use by zdb and debugging purposes only
- */
-uint64_t ub_max_txg = UINT64_MAX;
-
-/*
* Consider the following situation: txg is safely synced to disk. We've
* written the first uberblock for txg + 1, and then we lose power. When we
 * come back up, we fail to see the uberblock for txg + 1 because, say,
 * it was on a mirrored device and the replica to which we wrote txg + 1
 * suffered a latent fault.  We then make some changes and sync txg + 1.
 */
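
The selection rule this comment motivates (prefer the highest valid txg,
breaking ties by write timestamp) is what vdev_uberblock_compare(), used
below, implements. In sketch form:

/* Sketch of vdev_uberblock_compare(); the real one lives in this file. */
static int
vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
{
	if (ub1->ub_txg < ub2->ub_txg)
		return (-1);
	if (ub1->ub_txg > ub2->ub_txg)
		return (1);

	if (ub1->ub_timestamp < ub2->ub_timestamp)
		return (-1);
	if (ub1->ub_timestamp > ub2->ub_timestamp)
		return (1);

	return (0);
}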
static void
vdev_uberblock_load_done(zio_t *zio)
{
+ spa_t *spa = zio->io_spa;
zio_t *rio = zio->io_private;
uberblock_t *ub = zio->io_data;
uberblock_t *ubbest = rio->io_private;
if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
mutex_enter(&rio->io_lock);
- if (ub->ub_txg <= ub_max_txg &&
+ if (ub->ub_txg <= spa->spa_load_max_txg &&
vdev_uberblock_compare(ub, ubbest) > 0)
*ubbest = *ub;
mutex_exit(&rio->io_lock);
{
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
- int flags =
- ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
+ int c, l, n;
if (vd == rvd) {
ASSERT(zio == NULL);
ASSERT(zio != NULL);
- for (int c = 0; c < vd->vdev_children; c++)
+ for (c = 0; c < vd->vdev_children; c++)
vdev_uberblock_load(zio, vd->vdev_child[c], ubbest);
if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
- for (int l = 0; l < VDEV_LABELS; l++) {
- for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
+ for (l = 0; l < VDEV_LABELS; l++) {
+ for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
vdev_label_read(zio, vd, l,
zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)),
VDEV_UBERBLOCK_OFFSET(vd, n),
vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
{
uberblock_t *ubbuf;
- int n;
+ int c, l, n;
- for (int c = 0; c < vd->vdev_children; c++)
+ for (c = 0; c < vd->vdev_children; c++)
vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags);
	if (!vd->vdev_ops->vdev_op_leaf)
		return;

	n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1);

	ubbuf = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
	bzero(ubbuf, VDEV_UBERBLOCK_SIZE(vd));
*ubbuf = *ub;
- for (int l = 0; l < VDEV_LABELS; l++)
+ for (l = 0; l < VDEV_LABELS; l++)
vdev_label_write(zio, vd, l, ubbuf,
VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd),
vdev_uberblock_sync_done, zio->io_private,
spa_t *spa = svd[0]->vdev_spa;
zio_t *zio;
uint64_t good_writes = 0;
+ int v;
zio = zio_root(spa, NULL, &good_writes, flags);
- for (int v = 0; v < svdcount; v++)
+ for (v = 0; v < svdcount; v++)
vdev_uberblock_sync(zio, ub, svd[v], flags);
(void) zio_wait(zio);
*/
zio = zio_root(spa, NULL, NULL, flags);
- for (int v = 0; v < svdcount; v++)
+ for (v = 0; v < svdcount; v++)
zio_flush(zio, svd[v]);
(void) zio_wait(zio);
vdev_phys_t *vp;
char *buf;
size_t buflen;
+ int c;
- for (int c = 0; c < vd->vdev_children; c++)
+ for (c = 0; c < vd->vdev_children; c++)
vdev_label_sync(zio, vd->vdev_child[c], l, txg, flags);
	if (!vd->vdev_ops->vdev_op_leaf)
		return;

zio = zio_root(spa, NULL, NULL, flags);
for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) {
- uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t),
- KM_SLEEP);
- zio_t *vio = zio_null(zio, spa, NULL,
+ uint64_t *good_writes;
+ zio_t *vio;
+
+ ASSERT(!vd->vdev_ishole);
+
+ good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+ vio = zio_null(zio, spa, NULL,
(vd->vdev_islog || vd->vdev_aux != NULL) ?
vdev_label_sync_ignore_done : vdev_label_sync_top_done,
good_writes, flags);
* at any time, you can just call it again, and it will resume its work.
*/
int
-vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
+vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard)
{
spa_t *spa = svd[0]->vdev_spa;
uberblock_t *ub = &spa->spa_uberblock;
int error;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+ /*
+ * Normally, we don't want to try too hard to write every label and
+ * uberblock. If there is a flaky disk, we don't want the rest of the
+ * sync process to block while we retry. But if we can't write a
+ * single label out, we should retry with ZIO_FLAG_TRYHARD before
+ * bailing out and declaring the pool faulted.
+ */
+ if (tryhard)
+ flags |= ZIO_FLAG_TRYHARD;
+
ASSERT(ub->ub_txg <= txg);
/*