* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
vdev_t *mc_vd;
uint64_t mc_offset;
int mc_error;
+ int mc_pending;
uint8_t mc_tried;
uint8_t mc_skipped;
uint8_t mc_speculative;
mirror_child_t mm_child[1];
} mirror_map_t;
-int vdev_mirror_shift = 21;
+/*
+ * When the children are equally busy (they have the same number of pending
+ * I/Os), queue incoming requests to a single child for N microseconds.
+ * This is done to maximize the likelihood that the Linux elevator will be
+ * able to merge requests while it is plugged. Otherwise, requests are
+ * queued to the least busy device.
+ *
+ * For rotational disks the Linux elevator will plug for 10ms, which is
+ * why zfs_vdev_mirror_switch_us is set to 10ms by default. For non-
+ * rotational disks the elevator will not plug, but 10ms is still a small
+ * enough value that the requests will get spread over all the children.
+ *
+ * For fast SSDs it may make sense to decrease zfs_vdev_mirror_switch_us
+ * significantly to bound the worst case latencies. It would probably be
+ * ideal to calculate a decaying average of the last observed latencies and
+ * use that to dynamically adjust the zfs_vdev_mirror_switch_us time.
+ *
+ * The value is expressed in microseconds (10000 us == 10ms). It is used as
+ * a divisor when vdev_mirror_map_alloc() picks the preferred child, so it
+ * should be kept positive.
+ */
+int zfs_vdev_mirror_switch_us = 10000;
static void
vdev_mirror_map_free(zio_t *zio)
kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
}
+static const zio_vsd_ops_t vdev_mirror_vsd_ops = { /* set as zio->io_vsd_ops by vdev_mirror_map_alloc() */
+ vdev_mirror_map_free, /* frees the mirror_map_t with kmem_free() */
+ zio_vsd_default_cksum_report /* default checksum-report hook; defined outside this file */
+};
+/*
+ * Return the number of nodes currently in the pending tree of vd's I/O
+ * queue. The count is read while holding vq_lock and returned after the
+ * lock has been dropped.
+ */
+static int
+vdev_mirror_pending(vdev_t *vd)
+{
+ vdev_queue_t *vq = &vd->vdev_queue;
+ int pending;
+
+ mutex_enter(&vq->vq_lock);
+ pending = avl_numnodes(&vq->vq_pending_tree);
+ mutex_exit(&vq->vq_lock);
+
+ return (pending);
+}
+
+/*
+ * Build the mirror map for a zio, attach it as zio->io_vsd (together with
+ * vdev_mirror_vsd_ops) and return it.
+ *
+ * In the DVA-based branch the map has BP_GET_NDVAS() children and the
+ * preferred child comes from spa_get_random(). Otherwise the map has one
+ * entry per child vdev: replacing and spare vdevs always prefer child 0,
+ * while for a plain mirror the preferred child is the readable child with
+ * the fewest pending I/Os. Ties are rotated among the equally busy
+ * children once every zfs_vdev_mirror_switch_us microseconds. Children
+ * that are not readable are marked ENXIO/tried/skipped up front.
+ */
static mirror_map_t *
vdev_mirror_map_alloc(zio_t *zio)
{
c = BP_GET_NDVAS(zio->io_bp);
- mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
+ mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_PUSHPAGE);
mm->mm_children = c;
mm->mm_replacing = B_FALSE;
mm->mm_preferred = spa_get_random(c);
mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
}
} else {
+ int lowest_pending = INT_MAX;
+ int lowest_nr = 1;
+
c = vd->vdev_children;
- mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
+ mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_PUSHPAGE);
mm->mm_children = c;
mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
vd->vdev_ops == &vdev_spare_ops);
- mm->mm_preferred = mm->mm_replacing ? 0 :
- (zio->io_offset >> vdev_mirror_shift) % c;
+ mm->mm_preferred = 0;
mm->mm_root = B_FALSE;
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
mc->mc_vd = vd->vdev_child[c];
mc->mc_offset = zio->io_offset;
+
+ /* Replacing/spare vdevs keep child 0 as the preferred child. */
+ if (mm->mm_replacing)
+ continue;
+
+ /* Never prefer a child that cannot be read right now. */
+ if (!vdev_readable(mc->mc_vd)) {
+ mc->mc_error = ENXIO;
+ mc->mc_tried = 1;
+ mc->mc_skipped = 1;
+ mc->mc_pending = INT_MAX;
+ continue;
+ }
+
+ /* Track the smallest queue depth and how many children share it. */
+ mc->mc_pending = vdev_mirror_pending(mc->mc_vd);
+ if (mc->mc_pending < lowest_pending) {
+ lowest_pending = mc->mc_pending;
+ lowest_nr = 1;
+ } else if (mc->mc_pending == lowest_pending) {
+ lowest_nr++;
+ }
+ }
+
+ /*
+ * Pick the time slot used to rotate among the equally busy
+ * children. zfs_vdev_mirror_switch_us is a writable tunable, so
+ * clamp it to at least 1 to keep the divisor non-zero.
+ */
+ d = gethrtime() /
+ (NSEC_PER_USEC * MAX(zfs_vdev_mirror_switch_us, 1));
+ d = (d % lowest_nr) + 1;
+
+ /* Prefer the d-th child whose queue depth equals the minimum. */
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+
+ if (mc->mc_pending == lowest_pending) {
+ if (--d == 0) {
+ mm->mm_preferred = c;
+ break;
+ }
+ }
}
}
zio->io_vsd = mm;
- zio->io_vsd_free = vdev_mirror_map_free;
+ zio->io_vsd_ops = &vdev_mirror_vsd_ops;
return (mm);
}
static int
-vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
+ uint64_t *ashift)
{
int numerrors = 0;
int lasterror = 0;
+ int c;
if (vd->vdev_children == 0) {
vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
vdev_open_children(vd);
- for (int c = 0; c < vd->vdev_children; c++) {
+ for (c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
if (cvd->vdev_open_error) {
}
*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
+ *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
*ashift = MAX(*ashift, cvd->vdev_ashift);
}
static void
vdev_mirror_close(vdev_t *vd)
{
- for (int c = 0; c < vd->vdev_children; c++)
+ int c; /* child index, declared at block scope instead of in the for statement */
+
+ for (c = 0; c < vd->vdev_children; c++) /* call vdev_close() on each child vdev of vd */
vdev_close(vd->vdev_child[c]);
}
uint64_t txg = zio->io_txg;
int i, c;
- ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg);
+ ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
/*
* Try to find a child whose DTL doesn't contain the block to read.
static int
vdev_mirror_worst_error(mirror_map_t *mm)
{
- int error[2] = { 0, 0 };
+ int c, error[2] = { 0, 0 };
- for (int c = 0; c < mm->mm_children; c++) {
+ for (c = 0; c < mm->mm_children; c++) {
mirror_child_t *mc = &mm->mm_child[c];
int s = mc->mc_speculative;
error[s] = zio_worst_error(error[s], mc->mc_error);
vdev_mirror_io_start,
vdev_mirror_io_done,
vdev_mirror_state_change,
+ NULL,
+ NULL,
VDEV_TYPE_MIRROR, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
vdev_mirror_io_start,
vdev_mirror_io_done,
vdev_mirror_state_change,
+ NULL,
+ NULL,
VDEV_TYPE_REPLACING, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
vdev_mirror_io_start,
vdev_mirror_io_done,
vdev_mirror_state_change,
+ NULL,
+ NULL,
VDEV_TYPE_SPARE, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+module_param(zfs_vdev_mirror_switch_us, int, 0644); /* register the tunable as an int module parameter */
+MODULE_PARM_DESC(zfs_vdev_mirror_switch_us, "Switch mirrors every N usecs");
+#endif /* _KERNEL && HAVE_SPL */