#include <sys/zio.h>
#include <sys/sunldi.h>
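+/*
+ * Elevator requested for zfs block devices. VDEV_SCHEDULER is
+ * presumed to be defined elsewhere in the headers; per the module
+ * parameter description below it defaults to "noop".
+ */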
+char *zfs_vdev_scheduler = VDEV_SCHEDULER;
+
/*
* Virtual device vector for disks.
*/
vdev_disk_error(zio_t *zio)
{
#ifdef ZFS_DEBUG
- printk("ZFS: zio error=%d type=%d offset=%llu "
- "size=%llu flags=%x\n", zio->io_error, zio->io_type,
+ printk("ZFS: zio error=%d type=%d offset=%llu size=%llu "
+ "flags=%x delay=%llu\n", zio->io_error, zio->io_type,
(u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
- zio->io_flags);
+ zio->io_flags, (u_longlong_t)zio->io_delay);
#endif
}
+/*
+ * Use the Linux 'noop' elevator for zfs-managed block devices. This
+ * strikes the ideal balance: the zfs elevator performs all request
+ * ordering and prioritization, while the Linux elevator is left to
+ * do the maximum front/back merging allowed by the physical device.
+ * This yields the largest possible requests for the device with the
+ * lowest total overhead.
+ *
+ * Unfortunately we cannot directly call the elevator_switch() function
+ * because it is not exported from the block layer. This means we have
+ * to use the sysfs interface and a user space upcall. Pools will be
+ * automatically imported on module load so we must do this at device
+ * open time from the kernel.
+ */
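+/*
+ * For reference, the upcall constructed below is equivalent to
+ * running, e.g. (the device name "sda" is illustrative):
+ *
+ *   /bin/echo noop >/sys/block/sda/queue/scheduler
+ *
+ * from user space after the device has been opened.
+ */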
+static int
+vdev_elevator_switch(vdev_t *v, char *elevator)
+{
+ vdev_disk_t *vd = v->vdev_tsd;
+ struct block_device *bdev = vd->vd_bdev;
+ struct request_queue *q = bdev_get_queue(bdev);
+ char *device = bdev->bd_disk->disk_name;
+ char sh_path[] = "/bin/sh";
+ char sh_cmd[128];
+ char *argv[] = { sh_path, "-c", sh_cmd };
+ char *envp[] = { NULL };
+ int count = 0, error;
+
+ /* Skip devices without schedulers (loop, ram, dm, etc) */
+ if (!q->elevator || !blk_queue_stackable(q))
+ return (0);
+
+ /* Leave the existing scheduler in place when set to "none" */
+ if (strcmp(elevator, "none") == 0)
+ return (0);
+
+ /*
+ * Set the desired scheduler, retrying up to three times on
+ * -EFAULT, which has been observed to occur spuriously.
+ */
+ /* elevator is user controlled via module parameter, bound the write */
+ snprintf(sh_cmd, sizeof (sh_cmd),
+ "%s \"%s\" >/sys/block/%s/queue/scheduler",
+ "/bin/echo", elevator, device);
+
+ while (++count <= 3) {
+ /* 1 == UMH_WAIT_PROC, wait for the helper to complete */
+ error = call_usermodehelper(sh_path, argv, envp, 1);
+ if (error != -EFAULT)
+ break;
+ }
+
+ if (error)
+ printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
+ elevator, v->vdev_path, device, error);
+
+ return (error);
+}
+
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
{
/* Based on the minimum sector size set the block size */
*ashift = highbit(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
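+ /*
+ * For example (illustrative values): 512-byte sectors give
+ * highbit(512) - 1 = 9 and 4096-byte sectors give
+ * highbit(4096) - 1 = 12, i.e. ashift is log2 of the sector size.
+ */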
+ /* Try to set the io scheduler elevator algorithm */
+ (void) vdev_elevator_switch(v, zfs_vdev_scheduler);
+
return 0;
}
sizeof(struct bio *) * dr->dr_bio_count);
}
+static int
+vdev_disk_dio_is_sync(dio_request_t *dr)
+{
+#ifdef HAVE_BIO_RW_SYNC
+ /* BIO_RW_SYNC preferred interface from 2.6.12-2.6.29 */
+ return (dr->dr_rw & (1 << BIO_RW_SYNC));
+#else
+# ifdef HAVE_BIO_RW_SYNCIO
+ /* BIO_RW_SYNCIO preferred interface from 2.6.30-2.6.35 */
+ return (dr->dr_rw & (1 << BIO_RW_SYNCIO));
+# else
+# ifdef HAVE_REQ_SYNC
+ /* REQ_SYNC preferred interface from 2.6.36-2.6.xx */
+ return (dr->dr_rw & REQ_SYNC);
+# else
+# error "Unable to determine bio sync flag"
+# endif /* HAVE_REQ_SYNC */
+# endif /* HAVE_BIO_RW_SYNCIO */
+#endif /* HAVE_BIO_RW_SYNC */
+}
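+/*
+ * For instance, a caller passing READ_SYNC to __vdev_disk_physio()
+ * is detected as synchronous here, since READ_SYNC carries the
+ * kernel's sync bit under each of the interfaces above.
+ */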
+
static void
vdev_disk_dio_get(dio_request_t *dr)
{
vdev_disk_dio_free(dr);
if (zio) {
+ zio->io_delay = jiffies_to_msecs(
+ jiffies_64 - zio->io_delay);
zio->io_error = error;
ASSERT3S(zio->io_error, >=, 0);
if (zio->io_error)
rc = vdev_disk_dio_put(dr);
/* Wake up the synchronous waiter if this is the last outstanding bio */
- if ((rc == 1) && (dr->dr_rw & (1 << DIO_RW_SYNCIO)))
+ if ((rc == 1) && vdev_disk_dio_is_sync(dr))
complete(&dr->dr_comp);
BIO_END_IO_RETURN(0);
int bio_size, bio_count = 16;
int i = 0, error = 0, block_size;
+ ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size);
+
retry:
dr = vdev_disk_dio_alloc(bio_count);
if (dr == NULL)
return ENOMEM;
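+ /*
+ * Set failfast only for first-pass IO; retried IO and
+ * ZIO_FLAG_TRYHARD IO should let the block layer try harder.
+ * bio_set_flags_failfast() is the compatibility helper which sets
+ * whichever failfast bits the running kernel provides.
+ */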
+ if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
+ bio_set_flags_failfast(bdev, &flags);
+
dr->dr_zio = zio;
dr->dr_rw = flags;
block_size = vdev_bdev_block_size(bdev);
-#ifdef BIO_RW_FAILFAST
- if (flags & (1 << BIO_RW_FAILFAST))
- dr->dr_rw |= 1 << BIO_RW_FAILFAST;
-#endif /* BIO_RW_FAILFAST */
-
/*
* When the IO size exceeds the maximum bio size for the request
* queue, we are forced to break the IO into multiple bios and wait
/* Extra reference to protect dio_request during submit_bio */
vdev_disk_dio_get(dr);
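+ /*
+ * Stamp the submit time in jiffies; the completion callback
+ * converts the elapsed time to milliseconds in zio->io_delay.
+ */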
+ if (zio)
+ zio->io_delay = jiffies_64;
/* Submit all bio's associated with this dio */
for (i = 0; i < dr->dr_bio_count; i++)
* only synchronous consumer is vdev_disk_read_rootlabel(); all other
* IO originating from vdev_disk_io_start() is asynchronous.
*/
- if (dr->dr_rw & (1 << DIO_RW_SYNCIO)) {
+ if (vdev_disk_dio_is_sync(dr)) {
wait_for_completion(&dr->dr_comp);
error = dr->dr_error;
ASSERT3S(atomic_read(&dr->dr_ref), ==, 1);
vdev_disk_physio(struct block_device *bdev, caddr_t kbuf,
size_t size, uint64_t offset, int flags)
{
+ bio_set_flags_failfast(bdev, &flags);
return __vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags);
}
{
zio_t *zio = bio->bi_private;
+ zio->io_delay = jiffies_to_msecs(jiffies_64 - zio->io_delay);
zio->io_error = -rc;
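+ /*
+ * A device that fails the flush with EOPNOTSUPP has no usable
+ * write cache; remember this so ZFS stops issuing cache flushes
+ * to the vdev.
+ */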
if (rc && (rc == -EOPNOTSUPP))
zio->io_vd->vdev_nowritecache = B_TRUE;
bio->bi_end_io = vdev_disk_io_flush_completion;
bio->bi_private = zio;
bio->bi_bdev = bdev;
+ zio->io_delay = jiffies_64;
submit_bio(WRITE_BARRIER, bio);
return 0;
return ZIO_PIPELINE_CONTINUE;
}
-#ifdef BIO_RW_FAILFAST
- if (zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))
- flags |= (1 << BIO_RW_FAILFAST);
-#endif /* BIO_RW_FAILFAST */
-
error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data,
zio->io_size, zio->io_offset, flags);
if (error) {
return 0;
}
+
+module_param(zfs_vdev_scheduler, charp, 0644);
+MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler (default: noop)");
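+
+/*
+ * Example usage (illustrative): the scheduler may be selected at
+ * module load time with
+ *
+ *   modprobe zfs zfs_vdev_scheduler=deadline
+ *
+ * or, because the parameter is writable (0644), changed at runtime:
+ *
+ *   echo deadline >/sys/module/zfs/parameters/zfs_vdev_scheduler
+ *
+ * Runtime changes apply to vdevs opened after the change.
+ */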