X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fvdev_mirror.c;h=e0884dc3ce63915ffae6509563cb09eefa5acc54;hb=refs%2Fheads%2Frertzinger%2Ffeature-zpool-get--p;hp=698c0275d34ee75e950156af6ae3961d29265d0d;hpb=428870ff734fdaccc342b33fc53cf94724409a46;p=zfs.git

diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
index 698c027..e0884dc 100644
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -23,6 +23,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/vdev_impl.h>
@@ -37,6 +41,7 @@ typedef struct mirror_child {
 	vdev_t		*mc_vd;
 	uint64_t	mc_offset;
 	int		mc_error;
+	int		mc_pending;
 	uint8_t		mc_tried;
 	uint8_t		mc_skipped;
 	uint8_t		mc_speculative;
@@ -50,7 +55,23 @@ typedef struct mirror_map {
 	mirror_child_t	mm_child[1];
 } mirror_map_t;
 
-int vdev_mirror_shift = 21;
+/*
+ * When the children are equally busy, queue incoming requests to a single
+ * child for N microseconds.  This is done to maximize the likelihood that
+ * the Linux elevator will be able to merge requests while it is plugged.
+ * Otherwise, requests are queued to the least busy device.
+ *
+ * For rotational disks the Linux elevator will plug for 10ms which is
+ * why zfs_vdev_mirror_switch_us is set to 10ms by default.  For non-
+ * rotational disks the elevator will not plug, but 10ms is still a small
+ * enough value that the requests will get spread over all the children.
+ *
+ * For fast SSDs it may make sense to decrease zfs_vdev_mirror_switch_us
+ * significantly to bound the worst case latencies.  It would probably be
+ * ideal to calculate a decaying average of the last observed latencies and
+ * use that to dynamically adjust the zfs_vdev_mirror_switch_us time.
+ */
+int zfs_vdev_mirror_switch_us = 10000;
 
 static void
 vdev_mirror_map_free(zio_t *zio)
@@ -65,6 +86,19 @@ static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
 	zio_vsd_default_cksum_report
 };
 
+static int
+vdev_mirror_pending(vdev_t *vd)	/* returns vd's pending-tree node count */
+{
+	vdev_queue_t *vq = &vd->vdev_queue;
+	int pending;
+
+	mutex_enter(&vq->vq_lock);	/* read the count under vq_lock */
+	pending = avl_numnodes(&vq->vq_pending_tree);
+	mutex_exit(&vq->vq_lock);
+
+	return (pending);
+}
+
 static mirror_map_t *
 vdev_mirror_map_alloc(zio_t *zio)
 {
@@ -79,7 +113,7 @@ vdev_mirror_map_alloc(zio_t *zio)
 
 		c = BP_GET_NDVAS(zio->io_bp);
 
-		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
+		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_PUSHPAGE);
 		mm->mm_children = c;
 		mm->mm_replacing = B_FALSE;
 		mm->mm_preferred = spa_get_random(c);
@@ -104,20 +138,55 @@ vdev_mirror_map_alloc(zio_t *zio)
 			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
 		}
 	} else {
+		int lowest_pending = INT_MAX;
+		int lowest_nr = 1;
+
 		c = vd->vdev_children;
 
-		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
+		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_PUSHPAGE);
 		mm->mm_children = c;
 		mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
 		    vd->vdev_ops == &vdev_spare_ops);
-		mm->mm_preferred = mm->mm_replacing ? 0 :
-		    (zio->io_offset >> vdev_mirror_shift) % c;
+		mm->mm_preferred = 0;
 		mm->mm_root = B_FALSE;
 
 		for (c = 0; c < mm->mm_children; c++) {
 			mc = &mm->mm_child[c];
 			mc->mc_vd = vd->vdev_child[c];
 			mc->mc_offset = zio->io_offset;
+
+			if (mm->mm_replacing)
+				continue;
+
+			if (!vdev_readable(mc->mc_vd)) {
+				mc->mc_error = ENXIO;
+				mc->mc_tried = 1;
+				mc->mc_skipped = 1;
+				mc->mc_pending = INT_MAX;
+				continue;
+			}
+
+			mc->mc_pending = vdev_mirror_pending(mc->mc_vd);
+			if (mc->mc_pending < lowest_pending) {
+				lowest_pending = mc->mc_pending;
+				lowest_nr = 1;
+			} else if (mc->mc_pending == lowest_pending) {
+				lowest_nr++;
+			}
+		}
+
+		d = gethrtime() / (NSEC_PER_USEC * zfs_vdev_mirror_switch_us);
+		d = (d % lowest_nr) + 1;
+
+		for (c = 0; c < mm->mm_children; c++) {
+			mc = &mm->mm_child[c];
+
+			if (mm->mm_child[c].mc_pending == lowest_pending) {
+				if (--d == 0) {
+					mm->mm_preferred = c;
+					break;
+				}
+			}
 		}
 	}
 
@@ -127,10 +196,12 @@ vdev_mirror_map_alloc(zio_t *zio)
 }
 
 static int
-vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
+    uint64_t *ashift)
 {
 	int numerrors = 0;
 	int lasterror = 0;
+	int c;
 
 	if (vd->vdev_children == 0) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
@@ -139,7 +210,7 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
 
 	vdev_open_children(vd);
 
-	for (int c = 0; c < vd->vdev_children; c++) {
+	for (c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (cvd->vdev_open_error) {
@@ -149,6 +220,7 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
 		}
 
 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
+		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
 		*ashift = MAX(*ashift, cvd->vdev_ashift);
 	}
 
@@ -163,7 +235,9 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
 static void
 vdev_mirror_close(vdev_t *vd)
 {
-	for (int c = 0; c < vd->vdev_children; c++)
+	int c;
+
+	for (c = 0; c < vd->vdev_children; c++)
 		vdev_close(vd->vdev_child[c]);
 }
 
@@ -311,9 +385,9 @@ vdev_mirror_io_start(zio_t *zio)
 static int
 vdev_mirror_worst_error(mirror_map_t *mm)
 {
-	int error[2] = { 0, 0 };
+	int c, error[2] = { 0, 0 };
 
-	for (int c = 0; c < mm->mm_children; c++) {
+	for (c = 0; c < mm->mm_children; c++) {
 		mirror_child_t *mc = &mm->mm_child[c];
 		int s = mc->mc_speculative;
 		error[s] = zio_worst_error(error[s], mc->mc_error);
@@ -483,3 +557,8 @@ vdev_ops_t vdev_spare_ops = {
 	VDEV_TYPE_SPARE,	/* name of this vdev type */
 	B_FALSE			/* not a leaf vdev */
 };
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+module_param(zfs_vdev_mirror_switch_us, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_mirror_switch_us, "Switch mirrors every N usecs");
+#endif /* _KERNEL && HAVE_SPL */