X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fvdev_disk.c;h=aba3c4ab57ef284df3641bb3661312eef8cb2419;hb=4c0d8e50b99b4f3b4a9b7bc67ac7fc4e406f5755;hp=dbf9b08e763aee6ddf0949b3ac21ad8b9f01a993;hpb=60101509ee73c6e61e50c0a4079097f31bb39f4b;p=zfs.git

diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index dbf9b08..aba3c4a 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -33,6 +33,8 @@
 #include <sys/zio.h>
 #include <sys/sunldi.h>
 
+char *zfs_vdev_scheduler = VDEV_SCHEDULER;
+
 /*
  * Virtual device vector for disks.
  */
@@ -91,6 +93,54 @@ bdev_capacity(struct block_device *bdev)
 	return get_capacity(bdev->bd_disk);
 }
 
+static void
+vdev_disk_error(zio_t *zio)
+{
+#ifdef ZFS_DEBUG
+	printk("ZFS: zio error=%d type=%d offset=%llu size=%llu "
+	    "flags=%x delay=%llu\n", zio->io_error, zio->io_type,
+	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
+	    zio->io_flags, (u_longlong_t)zio->io_delay);
+#endif
+}
+
+/*
+ * Use the Linux 'noop' elevator for zfs managed block devices.  This
+ * strikes the ideal balance by allowing the zfs elevator to do all
+ * request ordering and prioritization, while allowing the Linux
+ * elevator to do the maximum front/back merging allowed by the
+ * physical device.  This yields the largest possible requests for
+ * the device with the lowest total overhead.
+ *
+ * Unfortunately we cannot directly call the elevator_switch() function
+ * because it is not exported from the block layer.  This means we have
+ * to use the sysfs interface and a user space upcall.  Pools will be
+ * automatically imported on module load so we must do this at device
+ * open time from the kernel.
+ */
+static int
+vdev_elevator_switch(vdev_t *v, char *elevator, char *device)
+{
+	char sh_path[] = "/bin/sh";
+	char sh_cmd[128];
+	char *argv[] = { sh_path, "-c", sh_cmd };
+	char *envp[] = { NULL };
+	int error;
+
+	if (!strncmp(elevator, "none", 4) && (strlen(elevator) == 4))
+		return (0);
+
+	sprintf(sh_cmd, "%s \"%s\" >/sys/block/%s/queue/scheduler",
+	    "/bin/echo", elevator, device);
+
+	error = call_usermodehelper(sh_path, argv, envp, 1);
+	if (error)
+		printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
+		    elevator, v->vdev_path, device, error);
+
+	return (error);
+}
+
 static int
 vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
 {
@@ -133,9 +183,19 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
 	vd->vd_bdev = bdev;
 	block_size = vdev_bdev_block_size(bdev);
 
-	/* Check if this is a whole device.  When bdev->bd_contains ==
-	 * bdev we have a whole device and not simply a partition. */
-	v->vdev_wholedisk = !!(bdev->bd_contains == bdev);
+	/* We think the wholedisk property should always be set when this
+	 * function is called.  ASSERT here so if any legitimate cases exist
+	 * where it's not set, we'll find them during debugging.  If we never
+	 * hit the ASSERT, this and the following conditional statement can be
+	 * removed. */
+	ASSERT3S(v->vdev_wholedisk, !=, -1ULL);
+
+	/* The wholedisk property was initialized to -1 in vdev_alloc() if it
+	 * was unspecified.  In that case, check if this is a whole device.
+	 * When bdev->bd_contains == bdev we have a whole device and not simply
+	 * a partition. */
+	if (v->vdev_wholedisk == -1ULL)
+		v->vdev_wholedisk = (bdev->bd_contains == bdev);
 
 	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
 	v->vdev_nowritecache = B_FALSE;
@@ -146,6 +206,10 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
 	/* Based on the minimum sector size set the block size */
 	*ashift = highbit(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
 
+	/* Try to set the io scheduler elevator algorithm */
+	(void) vdev_elevator_switch(v, zfs_vdev_scheduler,
+	    bdev->bd_disk->disk_name);
+
 	return 0;
 }
 
@@ -199,6 +263,27 @@ vdev_disk_dio_free(dio_request_t *dr)
 	    sizeof(struct bio *) * dr->dr_bio_count);
 }
 
+static int
+vdev_disk_dio_is_sync(dio_request_t *dr)
+{
+#ifdef HAVE_BIO_RW_SYNC
+	/* BIO_RW_SYNC preferred interface from 2.6.12-2.6.29 */
+	return (dr->dr_rw & (1 << BIO_RW_SYNC));
+#else
+# ifdef HAVE_BIO_RW_SYNCIO
+	/* BIO_RW_SYNCIO preferred interface from 2.6.30-2.6.35 */
+	return (dr->dr_rw & (1 << BIO_RW_SYNCIO));
+# else
+#  ifdef HAVE_REQ_SYNC
+	/* REQ_SYNC preferred interface from 2.6.36-2.6.xx */
+	return (dr->dr_rw & REQ_SYNC);
+#  else
+#   error "Unable to determine bio sync flag"
+#  endif /* HAVE_REQ_SYNC */
+# endif /* HAVE_BIO_RW_SYNCIO */
+#endif /* HAVE_BIO_RW_SYNC */
+}
+
 static void
 vdev_disk_dio_get(dio_request_t *dr)
 {
@@ -221,7 +306,12 @@ vdev_disk_dio_put(dio_request_t *dr)
 		vdev_disk_dio_free(dr);
 
 		if (zio) {
+			zio->io_delay = jiffies_to_msecs(
+			    jiffies_64 - zio->io_delay);
 			zio->io_error = error;
+			ASSERT3S(zio->io_error, >=, 0);
+			if (zio->io_error)
+				vdev_disk_error(zio);
 			zio_interrupt(zio);
 		}
 	}
@@ -249,16 +339,16 @@ BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, size, error)
 #endif /* HAVE_2ARGS_BIO_END_IO_T */
 
 	if (error == 0 && !test_bit(BIO_UPTODATE, &bio->bi_flags))
-		error = EIO;
+		error = -EIO;
 
 	if (dr->dr_error == 0)
-		dr->dr_error = error;
+		dr->dr_error = -error;
 
 	/* Drop reference acquired by __vdev_disk_physio */
 	rc = vdev_disk_dio_put(dr);
 
 	/* Wake up the synchronous waiter if this is the last outstanding bio */
-	if ((rc == 1) && (dr->dr_rw & (1 << DIO_RW_SYNCIO)))
+	if ((rc == 1) && vdev_disk_dio_is_sync(dr))
 		complete(&dr->dr_comp);
 
 	BIO_END_IO_RETURN(0);
@@ -313,20 +403,20 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr,
 	int bio_size, bio_count = 16;
 	int i = 0, error = 0, block_size;
 
+	ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size);
+
 retry:
 	dr = vdev_disk_dio_alloc(bio_count);
 	if (dr == NULL)
 		return ENOMEM;
 
+	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
+		bio_set_flags_failfast(bdev, &flags);
+
 	dr->dr_zio = zio;
 	dr->dr_rw = flags;
 	block_size = vdev_bdev_block_size(bdev);
 
-#ifdef BIO_RW_FAILFAST
-	if (flags & (1 << BIO_RW_FAILFAST))
-		dr->dr_rw |= 1 << BIO_RW_FAILFAST;
-#endif /* BIO_RW_FAILFAST */
-
 	/*
 	 * When the IO size exceeds the maximum bio size for the request
 	 * queue we are forced to break the IO into multiple bios and wait
@@ -381,6 +471,8 @@ retry:
 	/* Extra reference to protect dio_request during submit_bio */
 	vdev_disk_dio_get(dr);
+	if (zio)
+		zio->io_delay = jiffies_64;
 
 	/* Submit all bios associated with this dio */
 	for (i = 0; i < dr->dr_bio_count; i++)
 		if (dr->dr_bio[i])
@@ -395,7 +487,7 @@ retry:
 	 * only synchronous consumer is vdev_disk_read_rootlabel(); all other
 	 * IO originating from vdev_disk_io_start() is asynchronous.
 	 */
-	if (dr->dr_rw & (1 << DIO_RW_SYNCIO)) {
+	if (vdev_disk_dio_is_sync(dr)) {
 		wait_for_completion(&dr->dr_comp);
 		error = dr->dr_error;
 		ASSERT3S(atomic_read(&dr->dr_ref), ==, 1);
@@ -410,6 +502,7 @@ int
 vdev_disk_physio(struct block_device *bdev, caddr_t kbuf,
     size_t size, uint64_t offset, int flags)
 {
+	bio_set_flags_failfast(bdev, &flags);
 	return __vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags);
 }
 
@@ -419,11 +512,15 @@ BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, size, rc)
 {
 	zio_t *zio = bio->bi_private;
 
+	zio->io_delay = jiffies_to_msecs(jiffies_64 - zio->io_delay);
 	zio->io_error = -rc;
 	if (rc && (rc == -EOPNOTSUPP))
 		zio->io_vd->vdev_nowritecache = B_TRUE;
 
 	bio_put(bio);
+	ASSERT3S(zio->io_error, >=, 0);
+	if (zio->io_error)
+		vdev_disk_error(zio);
 	zio_interrupt(zio);
 
 	BIO_END_IO_RETURN(0);
@@ -446,6 +543,7 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
 	bio->bi_end_io = vdev_disk_io_flush_completion;
 	bio->bi_private = zio;
 	bio->bi_bdev = bdev;
+	zio->io_delay = jiffies_64;
 	submit_bio(WRITE_BARRIER, bio);
 	return 0;
 }
@@ -513,11 +611,6 @@ vdev_disk_io_start(zio_t *zio)
 		return ZIO_PIPELINE_CONTINUE;
 	}
 
-#ifdef BIO_RW_FAILFAST
-	if (zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))
-		flags |= (1 << BIO_RW_FAILFAST);
-#endif /* BIO_RW_FAILFAST */
-
 	error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data,
 	    zio->io_size, zio->io_offset, flags);
 	if (error) {
@@ -652,3 +745,6 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
 
 	return 0;
 }
+
+module_param(zfs_vdev_scheduler, charp, 0644);
+MODULE_PARM_DESC(zfs_vdev_scheduler, "IO Scheduler (noop)");
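
Usage note: the zfs_vdev_scheduler module parameter registered at the
end of this patch selects the elevator applied to zfs managed disks at
vdev open time.  It can be given at module load time (for example,
"modprobe zfs zfs_vdev_scheduler=noop") and, because it is registered
with mode 0644, read back or rewritten later through
/sys/module/zfs/parameters/zfs_vdev_scheduler; a new value only takes
effect the next time a vdev is opened.  Setting it to "none" skips the
switch entirely, matching the early return in vdev_elevator_switch().
The kernel upcall itself simply writes the elevator name into the
standard sysfs file for the disk.  The standalone userspace sketch
below performs the equivalent write directly; it is illustrative only,
and the device name "sda" and elevator "noop" are example values
rather than anything taken from the patch.

	#include <stdio.h>
	#include <string.h>
	#include <errno.h>

	/*
	 * Illustrative only: write an elevator name to
	 * /sys/block/<disk>/queue/scheduler, the same sysfs file the
	 * patch drives through its user space upcall.  Requires root,
	 * and the named elevator must be available in the running
	 * kernel.
	 */
	static int
	set_elevator(const char *disk, const char *elevator)
	{
		char path[128];
		FILE *fp;

		(void) snprintf(path, sizeof (path),
		    "/sys/block/%s/queue/scheduler", disk);

		fp = fopen(path, "w");
		if (fp == NULL)
			return (errno);

		if (fputs(elevator, fp) == EOF) {
			int err = errno;
			(void) fclose(fp);
			return (err);
		}

		return (fclose(fp) ? errno : 0);
	}

	int
	main(void)
	{
		/* "sda" and "noop" are example values */
		int error = set_elevator("sda", "noop");

		if (error)
			(void) fprintf(stderr, "elevator switch failed: %s\n",
			    strerror(error));

		return (error ? 1 : 0);
	}

Reading the same sysfs file back shows the active elevator in brackets
(for example "[noop] deadline cfq"), which is a quick way to confirm
the switch took effect.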