X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fvdev.c;h=bb5024f985f566e7ab12f1dba70ab1655bc16d26;hb=6119cb885a976e175a6e827894accf657ff1984f;hp=d9689e803b5edd6b1b83f984a33bbb27568017a2;hpb=fb5f0bc83330c8a0236c4d34a23723ac1974971a;p=zfs.git diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index d9689e8..bb5024f 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -39,6 +39,7 @@ #include #include #include +#include /* * Virtual device management. @@ -83,9 +84,8 @@ vdev_default_asize(vdev_t *vd, uint64_t psize) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; - uint64_t c; - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize(vd->vdev_child[c], psize); asize = MAX(asize, csize); } @@ -94,40 +94,47 @@ vdev_default_asize(vdev_t *vd, uint64_t psize) } /* - * Get the replaceable or attachable device size. - * If the parent is a mirror or raidz, the replaceable size is the minimum - * psize of all its children. For the rest, just return our own psize. - * - * e.g. - * psize rsize - * root - - - * mirror/raidz - - - * disk1 20g 20g - * disk2 40g 20g - * disk3 80g 80g + * Get the minimum allocatable size. We define the allocatable size as + * the vdev's asize rounded to the nearest metaslab. This allows us to + * replace or attach devices which don't have the same physical size but + * can still satisfy the same number of allocations. */ uint64_t -vdev_get_rsize(vdev_t *vd) +vdev_get_min_asize(vdev_t *vd) { - vdev_t *pvd, *cvd; - uint64_t c, rsize; + vdev_t *pvd = vd->vdev_parent; - pvd = vd->vdev_parent; + /* + * The our parent is NULL (inactive spare or cache) or is the root, + * just return our own asize. + */ + if (pvd == NULL) + return (vd->vdev_asize); /* - * If our parent is NULL or the root, just return our own psize. + * The top-level vdev just returns the allocatable size rounded + * to the nearest metaslab. */ - if (pvd == NULL || pvd->vdev_parent == NULL) - return (vd->vdev_psize); + if (vd == vd->vdev_top) + return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); - rsize = 0; + /* + * The allocatable space for a raidz vdev is N * sizeof(smallest child), + * so each child must provide at least 1/Nth of its asize. + */ + if (pvd->vdev_ops == &vdev_raidz_ops) + return (pvd->vdev_min_asize / pvd->vdev_children); - for (c = 0; c < pvd->vdev_children; c++) { - cvd = pvd->vdev_child[c]; - rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1; - } + return (pvd->vdev_min_asize); +} - return (rsize); +void +vdev_set_min_asize(vdev_t *vd) +{ + vd->vdev_min_asize = vdev_get_min_asize(vd); + + for (int c = 0; c < vd->vdev_children; c++) + vdev_set_min_asize(vd->vdev_child[c]); } vdev_t * @@ -148,13 +155,12 @@ vdev_lookup_top(spa_t *spa, uint64_t vdev) vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { - int c; vdev_t *mvd; if (vd->vdev_guid == guid) return (vd); - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); @@ -250,17 +256,17 @@ vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; - int newc, c; + int newc; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); - for (c = newc = 0; c < oldc; c++) + for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); - for (c = newc = 0; c < oldc; c++) { + for (int c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; @@ -372,6 +378,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (EINVAL); + } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) + return (EINVAL); } /* @@ -396,22 +405,26 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { /* - * Currently, we can only support 2 parity devices. + * Currently, we can only support 3 parity devices. */ - if (nparity == 0 || nparity > 2) + if (nparity == 0 || nparity > 3) return (EINVAL); /* - * Older versions can only support 1 parity device. + * Previous versions could only support 1 or 2 parity + * device. */ - if (nparity == 2 && - spa_version(spa) < SPA_VERSION_RAID6) + if (nparity > 1 && + spa_version(spa) < SPA_VERSION_RAIDZ2) + return (ENOTSUP); + if (nparity > 2 && + spa_version(spa) < SPA_VERSION_RAIDZ3) return (ENOTSUP); } else { /* * We require the parity to be specified for SPAs that * support multiple parity levels. */ - if (spa_version(spa) >= SPA_VERSION_RAID6) + if (spa_version(spa) >= SPA_VERSION_RAIDZ2) return (EINVAL); /* * Otherwise, we default to 1 parity device for RAID-Z. @@ -435,6 +448,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &vd->vdev_physpath) == 0) vd->vdev_physpath = spa_strdup(vd->vdev_physpath); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) + vd->vdev_fru = spa_strdup(vd->vdev_fru); /* * Set the whole_disk property. If it's not specified, leave the value @@ -448,9 +463,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ - if (!spa->spa_import_faulted) - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, - &vd->vdev_not_present); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, + &vd->vdev_not_present); /* * Get the alignment requirement. @@ -473,13 +487,23 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && - (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) { + (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || + alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl_smo.smo_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } + + if (alloctype == VDEV_ALLOC_ROOTPOOL) { + uint64_t spare = 0; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, + &spare) == 0 && spare) + spa_spare_add(vd); + } + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); @@ -511,7 +535,6 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, void vdev_free(vdev_t *vd) { - int c; spa_t *spa = vd->vdev_spa; /* @@ -525,7 +548,7 @@ vdev_free(vdev_t *vd) /* * Free all children. */ - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); ASSERT(vd->vdev_child == NULL); @@ -560,6 +583,8 @@ vdev_free(vdev_t *vd) spa_strfree(vd->vdev_devid); if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); + if (vd->vdev_fru) + spa_strfree(vd->vdev_fru); if (vd->vdev_isspare) spa_spare_remove(vd); @@ -653,14 +678,12 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) static void vdev_top_update(vdev_t *tvd, vdev_t *vd) { - int c; - if (vd == NULL) return; vd->vdev_top = tvd; - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_top_update(tvd, vd->vdev_child[c]); } @@ -679,6 +702,7 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd->vdev_asize = cvd->vdev_asize; + mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_state = cvd->vdev_state; @@ -751,6 +775,15 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) if (vd->vdev_ms_shift == 0) /* not being allocated from yet */ return (0); + /* + * Compute the raidz-deflation ratio. Note, we hard-code + * in 128k (1 << 17) because it is the current "typical" blocksize. + * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change, + * or we will inconsistently account for existing bp's. + */ + vd->vdev_deflate_ratio = (1 << 17) / + (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); + ASSERT(oldc <= newc); if (vd->vdev_islog) @@ -776,7 +809,8 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) if (txg == 0) { uint64_t object = 0; error = dmu_read(mos, vd->vdev_ms_array, - m * sizeof (uint64_t), sizeof (uint64_t), &object); + m * sizeof (uint64_t), sizeof (uint64_t), &object, + DMU_READ_PREFETCH); if (error) return (error); if (object != 0) { @@ -816,23 +850,22 @@ typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; int vps_flags; - zio_t *vps_root; - vdev_t *vps_vd; } vdev_probe_stats_t; static void vdev_probe_done(zio_t *zio) { spa_t *spa = zio->io_spa; + vdev_t *vd = zio->io_vd; vdev_probe_stats_t *vps = zio->io_private; - vdev_t *vd = vps->vps_vd; + + ASSERT(vd->vdev_probe_zio != NULL); if (zio->io_type == ZIO_TYPE_READ) { - ASSERT(zio->io_vd == vd); if (zio->io_error == 0) vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { - zio_nowait(zio_write_phys(vps->vps_root, vd, + zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio->io_offset, zio->io_size, zio->io_data, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); @@ -840,13 +873,11 @@ vdev_probe_done(zio_t *zio) zio_buf_free(zio->io_data, zio->io_size); } } else if (zio->io_type == ZIO_TYPE_WRITE) { - ASSERT(zio->io_vd == vd); if (zio->io_error == 0) vps->vps_writeable = 1; zio_buf_free(zio->io_data, zio->io_size); } else if (zio->io_type == ZIO_TYPE_NULL) { - ASSERT(zio->io_vd == NULL); - ASSERT(zio == vps->vps_root); + zio_t *pio; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; @@ -860,6 +891,16 @@ vdev_probe_done(zio_t *zio) spa, vd, NULL, 0, 0); zio->io_error = ENXIO; } + + mutex_enter(&vd->vdev_probe_lock); + ASSERT(vd->vdev_probe_zio == zio); + vd->vdev_probe_zio = NULL; + mutex_exit(&vd->vdev_probe_lock); + + while ((pio = zio_walk_parents(zio)) != NULL) + if (!vdev_accessible(vd, pio)) + pio->io_error = ENXIO; + kmem_free(vps, sizeof (*vps)); } } @@ -870,53 +911,116 @@ vdev_probe_done(zio_t *zio) * but the first (which we leave alone in case it contains a VTOC). */ zio_t * -vdev_probe(vdev_t *vd, zio_t *pio) +vdev_probe(vdev_t *vd, zio_t *zio) { spa_t *spa = vd->vdev_spa; - vdev_probe_stats_t *vps; - zio_t *zio; + vdev_probe_stats_t *vps = NULL; + zio_t *pio; - vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); + ASSERT(vd->vdev_ops->vdev_op_leaf); - vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_DONT_RETRY; + /* + * Don't probe the probe. + */ + if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) + return (NULL); - if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { - /* - * vdev_cant_read and vdev_cant_write can only transition - * from TRUE to FALSE when we have the SCL_ZIO lock as writer; - * otherwise they can only transition from FALSE to TRUE. - * This ensures that any zio looking at these values can - * assume that failures persist for the life of the I/O. - * That's important because when a device has intermittent - * connectivity problems, we want to ensure that they're - * ascribed to the device (ENXIO) and not the zio (EIO). - * - * Since we hold SCL_ZIO as writer here, clear both values - * so the probe can reevaluate from first principles. - */ - vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; - vd->vdev_cant_read = B_FALSE; - vd->vdev_cant_write = B_FALSE; + /* + * To prevent 'probe storms' when a device fails, we create + * just one probe i/o at a time. All zios that want to probe + * this vdev will become parents of the probe io. + */ + mutex_enter(&vd->vdev_probe_lock); + + if ((pio = vd->vdev_probe_zio) == NULL) { + vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); + + vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | + ZIO_FLAG_TRYHARD; + + if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { + /* + * vdev_cant_read and vdev_cant_write can only + * transition from TRUE to FALSE when we have the + * SCL_ZIO lock as writer; otherwise they can only + * transition from FALSE to TRUE. This ensures that + * any zio looking at these values can assume that + * failures persist for the life of the I/O. That's + * important because when a device has intermittent + * connectivity problems, we want to ensure that + * they're ascribed to the device (ENXIO) and not + * the zio (EIO). + * + * Since we hold SCL_ZIO as writer here, clear both + * values so the probe can reevaluate from first + * principles. + */ + vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; + vd->vdev_cant_read = B_FALSE; + vd->vdev_cant_write = B_FALSE; + } + + vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, + vdev_probe_done, vps, + vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); + + if (zio != NULL) { + vd->vdev_probe_wanted = B_TRUE; + spa_async_request(spa, SPA_ASYNC_PROBE); + } } - ASSERT(vd->vdev_ops->vdev_op_leaf); + if (zio != NULL) + zio_add_child(zio, pio); - zio = zio_null(pio, spa, vdev_probe_done, vps, vps->vps_flags); + mutex_exit(&vd->vdev_probe_lock); - vps->vps_root = zio; - vps->vps_vd = vd; + if (vps == NULL) { + ASSERT(zio != NULL); + return (NULL); + } for (int l = 1; l < VDEV_LABELS; l++) { - zio_nowait(zio_read_phys(zio, vd, + zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, - offsetof(vdev_label_t, vl_pad)), - VDEV_SKIP_SIZE, zio_buf_alloc(VDEV_SKIP_SIZE), + offsetof(vdev_label_t, vl_pad2)), + VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } - return (zio); + if (zio == NULL) + return (pio); + + zio_nowait(pio); + return (NULL); +} + +static void +vdev_open_child(void *arg) +{ + vdev_t *vd = arg; + + vd->vdev_open_thread = curthread; + vd->vdev_open_error = vdev_open(vd); + vd->vdev_open_thread = NULL; +} + +void +vdev_open_children(vdev_t *vd) +{ + taskq_t *tq; + int children = vd->vdev_children; + + tq = taskq_create("vdev_open", children, minclsyspri, + children, children, TASKQ_PREPOPULATE); + + for (int c = 0; c < children; c++) + VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], + TQ_SLEEP) != NULL); + + taskq_destroy(tq); } /* @@ -927,18 +1031,20 @@ vdev_open(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int error; - int c; uint64_t osize = 0; uint64_t asize, psize; uint64_t ashift = 0; - ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); - + ASSERT(vd->vdev_open_thread == curthread || + spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_stat.vs_aux = VDEV_AUX_NONE; + vd->vdev_cant_read = B_FALSE; + vd->vdev_cant_write = B_FALSE; + vd->vdev_min_asize = vdev_get_min_asize(vd); if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); @@ -954,7 +1060,7 @@ vdev_open(vdev_t *vd) error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift); if (zio_injection_enabled && error == 0) - error = zio_handle_device_injection(vd, ENXIO); + error = zio_handle_device_injection(vd, NULL, ENXIO); if (error) { if (vd->vdev_removed && @@ -976,12 +1082,13 @@ vdev_open(vdev_t *vd) vd->vdev_state = VDEV_STATE_HEALTHY; } - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); break; } + } osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); @@ -1006,6 +1113,15 @@ vdev_open(vdev_t *vd) vd->vdev_psize = psize; + /* + * Make sure the allocatable size hasn't shrunk. + */ + if (asize < vd->vdev_min_asize) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); + return (EINVAL); + } + if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. @@ -1022,25 +1138,18 @@ vdev_open(vdev_t *vd) VDEV_AUX_BAD_LABEL); return (EINVAL); } + } - /* - * Make sure the device hasn't shrunk. - */ - if (asize < vd->vdev_asize) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_LABEL); - return (EINVAL); - } + /* + * If all children are healthy and the asize has increased, + * then we've experienced dynamic LUN growth. If automatic + * expansion is enabled then use the additional space. + */ + if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize && + (vd->vdev_expanding || spa->spa_autoexpand)) + vd->vdev_asize = asize; - /* - * If all children are healthy and the asize has increased, - * then we've experienced dynamic LUN growth. - */ - if (vd->vdev_state == VDEV_STATE_HEALTHY && - asize > vd->vdev_asize) { - vd->vdev_asize = asize; - } - } + vdev_set_min_asize(vd); /* * Ensure we can issue some IO before declaring the @@ -1054,18 +1163,6 @@ vdev_open(vdev_t *vd) } /* - * If this is a top-level vdev, compute the raidz-deflation - * ratio. Note, we hard-code in 128k (1<<17) because it is the - * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE - * changes, this algorithm must never change, or we will - * inconsistently account for existing bp's. - */ - if (vd->vdev_top == vd) { - vd->vdev_deflate_ratio = (1<<17) / - (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT); - } - - /* * If a leaf vdev has a DTL, and seems healthy, then kick off a * resilver. But don't do this if we are doing a reopen for a scrub, * since this would just restart the scrub we are already doing. @@ -1091,12 +1188,11 @@ int vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - int c; nvlist_t *label; uint64_t guid, top_guid; uint64_t state; - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) if (vdev_validate(vd->vdev_child[c]) != 0) return (EBADF); @@ -1151,7 +1247,12 @@ vdev_validate(vdev_t *vd) nvlist_free(label); - if (spa->spa_load_state == SPA_LOAD_OPEN && + /* + * If spa->spa_load_verbatim is true, no need to check the + * state of the pool. + */ + if (!spa->spa_load_verbatim && + spa->spa_load_state == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) return (EBADF); @@ -1182,7 +1283,7 @@ vdev_close(vdev_t *vd) vdev_cache_purge(vd); /* - * We record the previous state before we close it, so that if we are + * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. */ @@ -1213,12 +1314,9 @@ vdev_reopen(vdev_t *vd) if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && - !l2arc_vdev_present(vd)) { - uint64_t size = vdev_get_rsize(vd); - l2arc_add_vdev(spa, vd, - VDEV_LABEL_START_SIZE, - size - VDEV_LABEL_START_SIZE); - } + vd->vdev_aux == &spa->spa_l2cache && + !l2arc_vdev_present(vd)) + l2arc_add_vdev(spa, vd); } else { (void) vdev_validate(vd); } @@ -1258,26 +1356,14 @@ vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) return (0); } -/* - * The is the latter half of vdev_create(). It is distinct because it - * involves initiating transactions in order to do metaslab creation. - * For creation, we want to try to create all vdevs at once and then undo it - * if anything fails; this is much harder if we have pending transactions. - */ void -vdev_init(vdev_t *vd, uint64_t txg) +vdev_metaslab_set_size(vdev_t *vd) { /* * Aim for roughly 200 metaslabs per vdev. */ vd->vdev_ms_shift = highbit(vd->vdev_asize / 200); vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); - - /* - * Initialize the vdev's metaslabs. This can't fail because - * there's nothing to read when creating all new metaslabs. - */ - VERIFY(vdev_metaslab_init(vd, txg) == 0); } void @@ -1835,7 +1921,7 @@ vdev_degrade(spa_t *spa, uint64_t guid) int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { - vdev_t *vd; + vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; spa_vdev_state_enter(spa); @@ -1845,13 +1931,26 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); - vdev_reopen(vd->vdev_top); + + /* XXX - L2ARC 1.0 does not support expansion */ + if (!vd->vdev_aux) { + for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) + pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); + } + + vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; + if (!vd->vdev_aux) { + for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) + pvd->vdev_expanding = B_FALSE; + } + if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && @@ -1860,13 +1959,21 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; + if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { + + /* XXX - L2ARC 1.0 does not support expansion */ + if (vd->vdev_aux) + return (spa_vdev_state_exit(spa, vd, ENOTSUP)); + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + } return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { - vdev_t *vd; + vdev_t *vd, *tvd; + int error; spa_vdev_state_enter(spa); @@ -1876,34 +1983,58 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; + /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* * If this device has the only valid copy of some data, - * don't allow it to be offlined. + * don't allow it to be offlined. Log devices are always + * expendable. */ - if (vd->vdev_aux == NULL && vdev_dtl_required(vd)) + if (!tvd->vdev_islog && vd->vdev_aux == NULL && + vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, EBUSY)); /* * Offline this device and reopen its top-level vdev. - * If this action results in the top-level vdev becoming - * unusable, undo it and fail the request. + * If the top-level vdev is a log device then just offline + * it. Otherwise, if this action results in the top-level + * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; - vdev_reopen(vd->vdev_top); - if (vd->vdev_aux == NULL && vdev_is_dead(vd->vdev_top)) { + vdev_reopen(tvd); + + if (!tvd->vdev_islog && vd->vdev_aux == NULL && + vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; - vdev_reopen(vd->vdev_top); + vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, EBUSY)); } } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); - return (spa_vdev_state_exit(spa, vd, 0)); + if (!tvd->vdev_islog || !vdev_is_dead(tvd)) + return (spa_vdev_state_exit(spa, vd, 0)); + + (void) spa_vdev_state_exit(spa, vd, 0); + + error = dmu_objset_find(spa_name(spa), zil_vdev_offline, + NULL, DS_FIND_CHILDREN); + if (error) { + (void) vdev_online(spa, guid, 0, NULL); + return (error); + } + /* + * If we successfully offlined the log device then we need to + * sync out the current txg so that the "stubby" block can be + * removed by zil_sync(). + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + return (0); } /* @@ -2018,7 +2149,9 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors; vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; - vs->vs_rsize = vdev_get_rsize(vd); + vs->vs_rsize = vdev_get_min_asize(vd); + if (vd->vdev_ops->vdev_op_leaf) + vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; mutex_exit(&vd->vdev_stat_lock); /* @@ -2111,14 +2244,24 @@ vdev_stat_update(zio_t *zio, uint64_t psize) if (flags & ZIO_FLAG_SPECULATIVE) return; + /* + * If this is an I/O error that is going to be retried, then ignore the + * error. Otherwise, the user may interpret B_FAILFAST I/O errors as + * hard errors, when in reality they can happen for any number of + * innocuous reasons (bus resets, MPxIO link failure, etc). + */ + if (zio->io_error == EIO && + !(zio->io_flags & ZIO_FLAG_IO_RETRY)) + return; + mutex_enter(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_READ) { + if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { if (zio->io_error == ECKSUM) vs->vs_checksum_errors++; else vs->vs_read_errors++; } - if (type == ZIO_TYPE_WRITE) + if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) vs->vs_write_errors++; mutex_exit(&vd->vdev_stat_lock); @@ -2161,10 +2304,9 @@ vdev_stat_update(zio_t *zio, uint64_t psize) void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) { - int c; vdev_stat_t *vs = &vd->vdev_stat; - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_scrub_stat_update(vd->vdev_child[c], type, complete); mutex_enter(&vd->vdev_stat_lock); @@ -2208,6 +2350,7 @@ vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta, * childrens', thus not accurate enough for us. */ ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); + ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; @@ -2249,8 +2392,8 @@ vdev_config_dirty(vdev_t *vd) int c; /* - * If this is an aux vdev (as with l2cache devices), then we update the - * vdev config manually and set the sync flag. + * If this is an aux vdev (as with l2cache and spare devices), then we + * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; @@ -2272,8 +2415,11 @@ vdev_config_dirty(vdev_t *vd) sav->sav_sync = B_TRUE; - VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, - ZPOOL_CONFIG_L2CACHE, &aux, &naux) == 0); + if (nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { + VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); + } ASSERT(c < naux); @@ -2371,11 +2517,10 @@ vdev_propagate_state(vdev_t *vd) vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; - int c; vdev_t *child; if (vd->vdev_children > 0) { - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; if (!vdev_readable(child) || @@ -2479,7 +2624,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) * an error. */ if (spa->spa_load_state == SPA_LOAD_IMPORT && - !spa->spa_import_faulted && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; @@ -2538,8 +2682,8 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) vd->vdev_removed = B_FALSE; } - if (!isopen) - vdev_propagate_state(vd); + if (!isopen && vd->vdev_parent) + vdev_propagate_state(vd->vdev_parent); } /* @@ -2551,8 +2695,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) boolean_t vdev_is_bootable(vdev_t *vd) { - int c; - if (!vd->vdev_ops->vdev_op_leaf) { char *vdev_type = vd->vdev_ops->vdev_op_type; @@ -2567,9 +2709,53 @@ vdev_is_bootable(vdev_t *vd) return (B_FALSE); } - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } + +void +vdev_load_log_state(vdev_t *vd, nvlist_t *nv) +{ + uint_t children; + nvlist_t **child; + uint64_t val; + spa_t *spa = vd->vdev_spa; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (int c = 0; c < children; c++) + vdev_load_log_state(vd->vdev_child[c], child[c]); + } + + if (vd->vdev_ops->vdev_op_leaf && nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_OFFLINE, &val) == 0 && val) { + + /* + * It would be nice to call vdev_offline() + * directly but the pool isn't fully loaded and + * the txg threads have not been started yet. + */ + spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_WRITER); + vd->vdev_offline = val; + vdev_reopen(vd->vdev_top); + spa_config_exit(spa, SCL_STATE_ALL, FTAG); + } +} + +/* + * Expand a vdev if possible. + */ +void +vdev_expand(vdev_t *vd, uint64_t txg) +{ + ASSERT(vd->vdev_top == vd); + ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { + VERIFY(vdev_metaslab_init(vd, txg) == 0); + vdev_config_dirty(vd); + } +}