#include <sys/zio.h>
#include <sys/sunldi.h>
+/*
+ * Name of the I/O scheduler (elevator) requested for vdev disks.  It is
+ * initialized to VDEV_SCHEDULER, handed to vdev_elevator_switch() from
+ * vdev_disk_open(), and exported as a module parameter.
+ */
+char *zfs_vdev_scheduler = VDEV_SCHEDULER;
+
/*
* Virtual device vector for disks.
*/
/* The partition capacity referenced by the block device */
if (part)
- return part->nr_sects;
+ return (part->nr_sects << 9);
/* Otherwise assume the full device capacity */
- return get_capacity(bdev->bd_disk);
+ return (get_capacity(bdev->bd_disk) << 9);
}
static void
#endif
}
+/*
+ * Use the Linux 'noop' elevator for zfs managed block devices. This
+ * strikes the ideal balance by allowing the zfs elevator to do all
+ * request ordering and prioritization, while allowing the Linux
+ * elevator to do the maximum front/back merging allowed by the
+ * physical device. This yields the largest possible requests for
+ * the device with the lowest total overhead.
+ *
+ * Unfortunately we cannot directly call the elevator_switch() function
+ * because it is not exported from the block layer. This means we have
+ * to use the sysfs interface and a user space upcall. Pools will be
+ * automatically imported on module load so we must do this at device
+ * open time from the kernel.
+ */
+/*
+ * Shell command template run via "/bin/sh -c".  The first %s is the disk
+ * name used to build the /sys/block/<disk>/queue/scheduler path that
+ * stdout is redirected to; the second %s is the scheduler name echoed
+ * into it.  stdin is taken from /dev/null and stderr is discarded.
+ */
+#define SET_SCHEDULER_CMD \
+ "exec 0</dev/null " \
+ " 1>/sys/block/%s/queue/scheduler " \
+ " 2>/dev/null; " \
+ "echo %s"
+
+/*
+ * Request the I/O scheduler named by 'elevator' for the disk backing
+ * vdev 'v'.
+ *
+ * The switch is performed by running "/bin/sh -c" through
+ * call_usermodehelper() with a command built from SET_SCHEDULER_CMD,
+ * the disk name and the scheduler name.
+ *
+ * Returns 0 without doing anything when the vdev is not a whole disk,
+ * when the request queue has no elevator or is not stackable, or when
+ * 'elevator' is exactly "none".  Otherwise returns the value reported
+ * by call_usermodehelper(); a non-zero value is also logged.
+ */
+static int
+vdev_elevator_switch(vdev_t *v, char *elevator)
+{
+	vdev_disk_t *vd = v->vdev_tsd;
+	struct block_device *bdev = vd->vd_bdev;
+	struct request_queue *q = bdev_get_queue(bdev);
+	char *device = bdev->bd_disk->disk_name;
+	char *argv[] = { "/bin/sh", "-c", NULL, NULL };
+	char *envp[] = { NULL };
+	int error;
+
+	/* Skip devices which are not whole disks (partitions) */
+	if (!v->vdev_wholedisk)
+		return (0);
+
+	/* Skip devices without schedulers (loop, ram, dm, etc) */
+	if (!q->elevator || !blk_queue_stackable(q))
+		return (0);
+
+	/* Leave existing scheduler when set to "none" */
+	if (strcmp(elevator, "none") == 0)
+		return (0);
+
+	/* The command string is allocated here and released below. */
+	argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
+
+	/*
+	 * Use the named wait mode instead of a bare integer: the numeric
+	 * values of the UMH_WAIT_* constants are not stable across kernel
+	 * versions, and we need to wait for the helper process itself so
+	 * that 'error' reflects whether the scheduler write succeeded.
+	 */
+	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+	if (error)
+		printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
+		    elevator, v->vdev_path, device, error);
+
+	strfree(argv[2]);
+
+	return (error);
+}
+
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
{
v->vdev_nowritecache = B_FALSE;
/* Physical volume size in bytes */
- *psize = bdev_capacity(bdev) * block_size;
+ *psize = bdev_capacity(bdev);
/* Based on the minimum sector size set the block size */
*ashift = highbit(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
+ /* Try to set the io scheduler elevator algorithm */
+ (void) vdev_elevator_switch(v, zfs_vdev_scheduler);
+
return 0;
}
caddr_t bio_ptr;
uint64_t bio_offset;
int bio_size, bio_count = 16;
- int i = 0, error = 0, block_size;
+ int i = 0, error = 0;
ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size);
dr->dr_zio = zio;
dr->dr_rw = flags;
- block_size = vdev_bdev_block_size(bdev);
/*
* When the IO size exceeds the maximum bio size for the request
vdev_disk_dio_get(dr);
dr->dr_bio[i]->bi_bdev = bdev;
- dr->dr_bio[i]->bi_sector = bio_offset / block_size;
+ dr->dr_bio[i]->bi_sector = bio_offset >> 9;
dr->dr_bio[i]->bi_rw = dr->dr_rw;
dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
dr->dr_bio[i]->bi_private = dr;
bio->bi_private = zio;
bio->bi_bdev = bdev;
zio->io_delay = jiffies_64;
- submit_bio(WRITE_BARRIER, bio);
+ submit_bio(VDEV_WRITE_FLUSH_FUA, bio);
return 0;
}
if (IS_ERR(bdev))
return -PTR_ERR(bdev);
- s = bdev_capacity(bdev) * vdev_bdev_block_size(bdev);
+ s = bdev_capacity(bdev);
if (s == 0) {
vdev_bdev_close(bdev, vdev_bdev_mode(FREAD));
return EIO;
return 0;
}
+
+/*
+ * Expose zfs_vdev_scheduler as a string (charp) module parameter with
+ * mode 0644.  The value is passed to vdev_elevator_switch() each time
+ * vdev_disk_open() runs.
+ */
+module_param(zfs_vdev_scheduler, charp, 0644);
+MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");