X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fvdev_mirror.c;h=e0884dc3ce63915ffae6509563cb09eefa5acc54;hb=refs%2Fheads%2Frertzinger%2Ffeature-zpool-get--p;hp=c4629ff45087c1efd23e45ddb715a79d72daeaac;hpb=172bb4bd5e4afef721dd4d2972d8680d983f144b;p=zfs.git diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index c4629ff..e0884dc 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -19,10 +19,14 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + #include #include #include @@ -37,6 +41,7 @@ typedef struct mirror_child { vdev_t *mc_vd; uint64_t mc_offset; int mc_error; + int mc_pending; uint8_t mc_tried; uint8_t mc_skipped; uint8_t mc_speculative; @@ -50,7 +55,23 @@ typedef struct mirror_map { mirror_child_t mm_child[1]; } mirror_map_t; -int vdev_mirror_shift = 21; +/* + * When the children are equally busy queue incoming requests to a single + * child for N microseconds. This is done to maximize the likelihood that + * the Linux elevator will be able to merge requests while it is plugged. + * Otherwise, requests are queued to the least busy device. + * + * For rotational disks the Linux elevator will plug for 10ms which is + * why zfs_vdev_mirror_switch_us is set to 10ms by default. For non- + * rotational disks the elevator will not plug, but 10ms is still a small + * enough value that the requests will get spread over all the children. + * + * For fast SSDs it may make sense to decrease zfs_vdev_mirror_switch_us + * significantly to bound the worst case latencies. It would probably be + * ideal to calculate a decaying average of the last observed latencies and + * use that to dynamically adjust the zfs_vdev_mirror_switch_us time. + */ +int zfs_vdev_mirror_switch_us = 10000; static void vdev_mirror_map_free(zio_t *zio) @@ -60,6 +81,24 @@ vdev_mirror_map_free(zio_t *zio) kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children])); } +static const zio_vsd_ops_t vdev_mirror_vsd_ops = { + vdev_mirror_map_free, + zio_vsd_default_cksum_report +}; + +static int +vdev_mirror_pending(vdev_t *vd) +{ + vdev_queue_t *vq = &vd->vdev_queue; + int pending; + + mutex_enter(&vq->vq_lock); + pending = avl_numnodes(&vq->vq_pending_tree); + mutex_exit(&vq->vq_lock); + + return (pending); +} + static mirror_map_t * vdev_mirror_map_alloc(zio_t *zio) { @@ -74,7 +113,7 @@ vdev_mirror_map_alloc(zio_t *zio) c = BP_GET_NDVAS(zio->io_bp); - mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); + mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_PUSHPAGE); mm->mm_children = c; mm->mm_replacing = B_FALSE; mm->mm_preferred = spa_get_random(c); @@ -99,51 +138,89 @@ vdev_mirror_map_alloc(zio_t *zio) mc->mc_offset = DVA_GET_OFFSET(&dva[c]); } } else { + int lowest_pending = INT_MAX; + int lowest_nr = 1; + c = vd->vdev_children; - mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); + mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_PUSHPAGE); mm->mm_children = c; mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops); - mm->mm_preferred = mm->mm_replacing ? 0 : - (zio->io_offset >> vdev_mirror_shift) % c; + mm->mm_preferred = 0; mm->mm_root = B_FALSE; for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vd->vdev_child[c]; mc->mc_offset = zio->io_offset; + + if (mm->mm_replacing) + continue; + + if (!vdev_readable(mc->mc_vd)) { + mc->mc_error = ENXIO; + mc->mc_tried = 1; + mc->mc_skipped = 1; + mc->mc_pending = INT_MAX; + continue; + } + + mc->mc_pending = vdev_mirror_pending(mc->mc_vd); + if (mc->mc_pending < lowest_pending) { + lowest_pending = mc->mc_pending; + lowest_nr = 1; + } else if (mc->mc_pending == lowest_pending) { + lowest_nr++; + } + } + + d = gethrtime() / (NSEC_PER_USEC * zfs_vdev_mirror_switch_us); + d = (d % lowest_nr) + 1; + + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; + + if (mm->mm_child[c].mc_pending == lowest_pending) { + if (--d == 0) { + mm->mm_preferred = c; + break; + } + } } } zio->io_vsd = mm; - zio->io_vsd_free = vdev_mirror_map_free; + zio->io_vsd_ops = &vdev_mirror_vsd_ops; return (mm); } static int -vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) +vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, + uint64_t *ashift) { - vdev_t *cvd; - uint64_t c; int numerrors = 0; - int ret, lasterror = 0; + int lasterror = 0; + int c; if (vd->vdev_children == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (EINVAL); } + vdev_open_children(vd); + for (c = 0; c < vd->vdev_children; c++) { - cvd = vd->vdev_child[c]; + vdev_t *cvd = vd->vdev_child[c]; - if ((ret = vdev_open(cvd)) != 0) { - lasterror = ret; + if (cvd->vdev_open_error) { + lasterror = cvd->vdev_open_error; numerrors++; continue; } *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; + *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *ashift = MAX(*ashift, cvd->vdev_ashift); } @@ -158,7 +235,7 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) static void vdev_mirror_close(vdev_t *vd) { - uint64_t c; + int c; for (c = 0; c < vd->vdev_children; c++) vdev_close(vd->vdev_child[c]); @@ -180,11 +257,16 @@ vdev_mirror_scrub_done(zio_t *zio) mirror_child_t *mc = zio->io_private; if (zio->io_error == 0) { - zio_t *pio = zio->io_parent; - mutex_enter(&pio->io_lock); - ASSERT3U(zio->io_size, >=, pio->io_size); - bcopy(zio->io_data, pio->io_data, pio->io_size); - mutex_exit(&pio->io_lock); + zio_t *pio; + + mutex_enter(&zio->io_lock); + while ((pio = zio_walk_parents(zio)) != NULL) { + mutex_enter(&pio->io_lock); + ASSERT3U(zio->io_size, >=, pio->io_size); + bcopy(zio->io_data, pio->io_data, pio->io_size); + mutex_exit(&pio->io_lock); + } + mutex_exit(&zio->io_lock); } zio_buf_free(zio->io_data, zio->io_size); @@ -206,7 +288,7 @@ vdev_mirror_child_select(zio_t *zio) uint64_t txg = zio->io_txg; int i, c; - ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg); + ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg); /* * Try to find a child whose DTL doesn't contain the block to read. @@ -225,7 +307,7 @@ vdev_mirror_child_select(zio_t *zio) mc->mc_skipped = 1; continue; } - if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, txg, 1)) + if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) return (c); mc->mc_error = ESTALE; mc->mc_skipped = 1; @@ -282,20 +364,10 @@ vdev_mirror_io_start(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_WRITE); /* - * If this is a resilvering I/O to a replacing vdev, - * only the last child should be written -- unless the - * first child happens to have a DTL entry here as well. - * All other writes go to all children. + * Writes go to all children. */ - if ((zio->io_flags & ZIO_FLAG_RESILVER) && mm->mm_replacing && - !vdev_dtl_contains(&mm->mm_child[0].mc_vd->vdev_dtl_map, - zio->io_txg, 1)) { - c = mm->mm_children - 1; - children = 1; - } else { - c = 0; - children = mm->mm_children; - } + c = 0; + children = mm->mm_children; } while (children--) { @@ -313,9 +385,9 @@ vdev_mirror_io_start(zio_t *zio) static int vdev_mirror_worst_error(mirror_map_t *mm) { - int error[2] = { 0, 0 }; + int c, error[2] = { 0, 0 }; - for (int c = 0; c < mm->mm_children; c++) { + for (c = 0; c < mm->mm_children; c++) { mirror_child_t *mc = &mm->mm_child[c]; int s = mc->mc_speculative; error[s] = zio_worst_error(error[s], mc->mc_error); @@ -398,7 +470,7 @@ vdev_mirror_io_done(zio_t *zio) ASSERT(zio->io_error != 0); } - if (good_copies && (spa_mode & FWRITE) && + if (good_copies && spa_writeable(zio->io_spa) && (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER) || ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) { @@ -419,7 +491,7 @@ vdev_mirror_io_done(zio_t *zio) if (mc->mc_tried) continue; if (!(zio->io_flags & ZIO_FLAG_SCRUB) && - !vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, + !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, zio->io_txg, 1)) continue; mc->mc_error = ESTALE; @@ -429,7 +501,8 @@ vdev_mirror_io_done(zio_t *zio) mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, ZIO_TYPE_WRITE, zio->io_priority, - ZIO_FLAG_IO_REPAIR, NULL, NULL)); + ZIO_FLAG_IO_REPAIR | (unexpected_errors ? + ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } } @@ -453,6 +526,8 @@ vdev_ops_t vdev_mirror_ops = { vdev_mirror_io_start, vdev_mirror_io_done, vdev_mirror_state_change, + NULL, + NULL, VDEV_TYPE_MIRROR, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; @@ -464,6 +539,8 @@ vdev_ops_t vdev_replacing_ops = { vdev_mirror_io_start, vdev_mirror_io_done, vdev_mirror_state_change, + NULL, + NULL, VDEV_TYPE_REPLACING, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; @@ -475,6 +552,13 @@ vdev_ops_t vdev_spare_ops = { vdev_mirror_io_start, vdev_mirror_io_done, vdev_mirror_state_change, + NULL, + NULL, VDEV_TYPE_SPARE, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; + +#if defined(_KERNEL) && defined(HAVE_SPL) +module_param(zfs_vdev_mirror_switch_us, int, 0644); +MODULE_PARM_DESC(zfs_vdev_mirror_switch_us, "Switch mirrors every N usecs"); +#endif