X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fvdev_mirror.c;h=e0884dc3ce63915ffae6509563cb09eefa5acc54;hb=refs%2Fheads%2Frertzinger%2Ffeature-zpool-get--p;hp=698c0275d34ee75e950156af6ae3961d29265d0d;hpb=428870ff734fdaccc342b33fc53cf94724409a46;p=zfs.git diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 698c027..e0884dc 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + #include #include #include @@ -37,6 +41,7 @@ typedef struct mirror_child { vdev_t *mc_vd; uint64_t mc_offset; int mc_error; + int mc_pending; uint8_t mc_tried; uint8_t mc_skipped; uint8_t mc_speculative; @@ -50,7 +55,23 @@ typedef struct mirror_map { mirror_child_t mm_child[1]; } mirror_map_t; -int vdev_mirror_shift = 21; +/* + * When the children are equally busy queue incoming requests to a single + * child for N microseconds. This is done to maximize the likelihood that + * the Linux elevator will be able to merge requests while it is plugged. + * Otherwise, requests are queued to the least busy device. + * + * For rotational disks the Linux elevator will plug for 10ms which is + * why zfs_vdev_mirror_switch_us is set to 10ms by default. For non- + * rotational disks the elevator will not plug, but 10ms is still a small + * enough value that the requests will get spread over all the children. + * + * For fast SSDs it may make sense to decrease zfs_vdev_mirror_switch_us + * significantly to bound the worst case latencies. It would probably be + * ideal to calculate a decaying average of the last observed latencies and + * use that to dynamically adjust the zfs_vdev_mirror_switch_us time. + */ +int zfs_vdev_mirror_switch_us = 10000; static void vdev_mirror_map_free(zio_t *zio) @@ -65,6 +86,19 @@ static const zio_vsd_ops_t vdev_mirror_vsd_ops = { zio_vsd_default_cksum_report }; +static int +vdev_mirror_pending(vdev_t *vd) +{ + vdev_queue_t *vq = &vd->vdev_queue; + int pending; + + mutex_enter(&vq->vq_lock); + pending = avl_numnodes(&vq->vq_pending_tree); + mutex_exit(&vq->vq_lock); + + return (pending); +} + static mirror_map_t * vdev_mirror_map_alloc(zio_t *zio) { @@ -79,7 +113,7 @@ vdev_mirror_map_alloc(zio_t *zio) c = BP_GET_NDVAS(zio->io_bp); - mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); + mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_PUSHPAGE); mm->mm_children = c; mm->mm_replacing = B_FALSE; mm->mm_preferred = spa_get_random(c); @@ -104,20 +138,55 @@ vdev_mirror_map_alloc(zio_t *zio) mc->mc_offset = DVA_GET_OFFSET(&dva[c]); } } else { + int lowest_pending = INT_MAX; + int lowest_nr = 1; + c = vd->vdev_children; - mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); + mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_PUSHPAGE); mm->mm_children = c; mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops); - mm->mm_preferred = mm->mm_replacing ? 0 : - (zio->io_offset >> vdev_mirror_shift) % c; + mm->mm_preferred = 0; mm->mm_root = B_FALSE; for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vd->vdev_child[c]; mc->mc_offset = zio->io_offset; + + if (mm->mm_replacing) + continue; + + if (!vdev_readable(mc->mc_vd)) { + mc->mc_error = ENXIO; + mc->mc_tried = 1; + mc->mc_skipped = 1; + mc->mc_pending = INT_MAX; + continue; + } + + mc->mc_pending = vdev_mirror_pending(mc->mc_vd); + if (mc->mc_pending < lowest_pending) { + lowest_pending = mc->mc_pending; + lowest_nr = 1; + } else if (mc->mc_pending == lowest_pending) { + lowest_nr++; + } + } + + d = gethrtime() / (NSEC_PER_USEC * zfs_vdev_mirror_switch_us); + d = (d % lowest_nr) + 1; + + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; + + if (mm->mm_child[c].mc_pending == lowest_pending) { + if (--d == 0) { + mm->mm_preferred = c; + break; + } + } } } @@ -127,10 +196,12 @@ vdev_mirror_map_alloc(zio_t *zio) } static int -vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) +vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, + uint64_t *ashift) { int numerrors = 0; int lasterror = 0; + int c; if (vd->vdev_children == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; @@ -139,7 +210,7 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) vdev_open_children(vd); - for (int c = 0; c < vd->vdev_children; c++) { + for (c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_open_error) { @@ -149,6 +220,7 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) } *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; + *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *ashift = MAX(*ashift, cvd->vdev_ashift); } @@ -163,7 +235,9 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) static void vdev_mirror_close(vdev_t *vd) { - for (int c = 0; c < vd->vdev_children; c++) + int c; + + for (c = 0; c < vd->vdev_children; c++) vdev_close(vd->vdev_child[c]); } @@ -311,9 +385,9 @@ vdev_mirror_io_start(zio_t *zio) static int vdev_mirror_worst_error(mirror_map_t *mm) { - int error[2] = { 0, 0 }; + int c, error[2] = { 0, 0 }; - for (int c = 0; c < mm->mm_children; c++) { + for (c = 0; c < mm->mm_children; c++) { mirror_child_t *mc = &mm->mm_child[c]; int s = mc->mc_speculative; error[s] = zio_worst_error(error[s], mc->mc_error); @@ -483,3 +557,8 @@ vdev_ops_t vdev_spare_ops = { VDEV_TYPE_SPARE, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; + +#if defined(_KERNEL) && defined(HAVE_SPL) +module_param(zfs_vdev_mirror_switch_us, int, 0644); +MODULE_PARM_DESC(zfs_vdev_mirror_switch_us, "Switch mirrors every N usecs"); +#endif