+/*
+ * Switch the Linux elevator (I/O scheduler) used by the block device
+ * backing vdev 'v' to the scheduler named by 'elevator'.
+ *
+ * Use the Linux 'noop' elevator for zfs managed block devices. This
+ * strikes the ideal balance by allowing the zfs elevator to do all
+ * request ordering and prioritization, while still allowing the Linux
+ * elevator to do the maximum front/back merging allowed by the
+ * physical device. This yields the largest possible requests for
+ * the device with the lowest total overhead.
+ *
+ * Unfortunately we cannot directly call the elevator_switch() function
+ * because it is not exported from the block layer. This means we have
+ * to use the sysfs interface and a user space upcall. Pools will be
+ * automatically imported on module load so we must do this at device
+ * open time from the kernel.
+ *
+ * Returns 0 when the device is skipped (not a whole disk, no usable
+ * scheduler, or 'elevator' is "none") or when the upcall succeeds.
+ * Otherwise the error reported by call_usermodehelper() is returned,
+ * or -ENAMETOOLONG when the shell command does not fit in the local
+ * command buffer. Any failure is also logged with printk().
+ */
+static int
+vdev_elevator_switch(vdev_t *v, char *elevator)
+{
+	vdev_disk_t *vd = v->vdev_tsd;
+	struct block_device *bdev = vd->vd_bdev;
+	struct request_queue *q = bdev_get_queue(bdev);
+	char *device = bdev->bd_disk->disk_name;
+	char sh_path[] = "/bin/sh";
+	char sh_cmd[128];
+	/* The argument vector handed to the helper must end with NULL */
+	char *argv[] = { sh_path, "-c", sh_cmd, NULL };
+	char *envp[] = { NULL };
+	int count, len, error = 0;
+
+	/* Skip devices which are not whole disks (partitions) */
+	if (!v->vdev_wholedisk)
+		return (0);
+
+	/* Skip devices without schedulers (loop, ram, dm, etc) */
+	if (!q->elevator || !blk_queue_stackable(q))
+		return (0);
+
+	/* Leave existing scheduler when set to "none" */
+	if (strcmp(elevator, "none") == 0)
+		return (0);
+
+	/*
+	 * Build the shell command. The output is bounded by the size of
+	 * sh_cmd; a truncated command must never be executed, so treat
+	 * truncation as an error instead of overrunning the stack buffer.
+	 */
+	len = snprintf(sh_cmd, sizeof (sh_cmd),
+	    "%s \"%s\" >/sys/block/%s/queue/scheduler",
+	    "/bin/echo", elevator, device);
+
+	if (len < 0 || len >= (int)sizeof (sh_cmd)) {
+		error = -ENAMETOOLONG;
+	} else {
+		/*
+		 * Set the desired scheduler with a three attempt retry for
+		 * -EFAULT which has been observed to occur spuriously. Any
+		 * other result, including success, ends the loop.
+		 *
+		 * NOTE(review): the literal wait argument '1' is kept from
+		 * the original code; confirm it still selects "wait for the
+		 * process to exit" on every supported kernel version.
+		 */
+		for (count = 0; count < 3; count++) {
+			error = call_usermodehelper(sh_path, argv, envp, 1);
+			if (error != -EFAULT)
+				break;
+		}
+	}
+
+	if (error)
+		printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
+		    elevator, v->vdev_path, device, error);
+
+	return (error);
+}
+