X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fvdev_disk.c;h=aba3c4ab57ef284df3641bb3661312eef8cb2419;hb=4c0d8e50b99b4f3b4a9b7bc67ac7fc4e406f5755;hp=51062ef23529d3489e954b7a9d1f270603b37803;hpb=a69052be7f9a4008e2b09578e9db5fdebc186111;p=zfs.git

diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index 51062ef..aba3c4a 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -33,6 +33,8 @@
 #include
 #include
 
+char *zfs_vdev_scheduler = VDEV_SCHEDULER;
+
 /*
  * Virtual device vector for disks.
  */
@@ -102,6 +104,43 @@ vdev_disk_error(zio_t *zio)
 #endif
 }
 
+/*
+ * Use the Linux 'noop' elevator for zfs managed block devices.  This
+ * strikes the ideal balance by allowing the zfs elevator to do all
+ * request ordering and prioritization, while allowing the Linux
+ * elevator to do the maximum front/back merging allowed by the
+ * physical device.  This yields the largest possible requests for
+ * the device with the lowest total overhead.
+ *
+ * Unfortunately we cannot directly call the elevator_switch() function
+ * because it is not exported from the block layer.  This means we have
+ * to use the sysfs interface and a user space upcall.  Pools will be
+ * automatically imported on module load so we must do this at device
+ * open time from the kernel.
+ */
+static int
+vdev_elevator_switch(vdev_t *v, char *elevator, char *device)
+{
+	char sh_path[] = "/bin/sh";
+	char sh_cmd[128];
+	char *argv[] = { sh_path, "-c", sh_cmd };
+	char *envp[] = { NULL };
+	int error;
+
+	if (!strncmp(elevator, "none", 4) && (strlen(elevator) == 4))
+		return (0);
+
+	sprintf(sh_cmd, "%s \"%s\" >/sys/block/%s/queue/scheduler",
+	    "/bin/echo", elevator, device);
+
+	error = call_usermodehelper(sh_path, argv, envp, 1);
+	if (error)
+		printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
+		    elevator, v->vdev_path, device, error);
+
+	return (error);
+}
+
 static int
 vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
 {
@@ -167,6 +206,10 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
 	/* Based on the minimum sector size set the block size */
 	*ashift = highbit(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
 
+	/* Try to set the io scheduler elevator algorithm */
+	(void) vdev_elevator_switch(v, zfs_vdev_scheduler,
+	    bdev->bd_disk->disk_name);
+
 	return 0;
 }
 
@@ -220,6 +263,27 @@ vdev_disk_dio_free(dio_request_t *dr)
 	    sizeof(struct bio *) * dr->dr_bio_count);
 }
 
+static int
+vdev_disk_dio_is_sync(dio_request_t *dr)
+{
+#ifdef HAVE_BIO_RW_SYNC
+	/* BIO_RW_SYNC preferred interface from 2.6.12-2.6.29 */
+	return (dr->dr_rw & (1 << BIO_RW_SYNC));
+#else
+# ifdef HAVE_BIO_RW_SYNCIO
+	/* BIO_RW_SYNCIO preferred interface from 2.6.30-2.6.35 */
+	return (dr->dr_rw & (1 << BIO_RW_SYNCIO));
+# else
+#  ifdef HAVE_REQ_SYNC
+	/* REQ_SYNC preferred interface from 2.6.36-2.6.xx */
+	return (dr->dr_rw & REQ_SYNC);
+#  else
+#   error "Unable to determine bio sync flag"
+#  endif /* HAVE_REQ_SYNC */
+# endif /* HAVE_BIO_RW_SYNCIO */
+#endif /* HAVE_BIO_RW_SYNC */
+}
+
 static void
 vdev_disk_dio_get(dio_request_t *dr)
 {
@@ -284,7 +348,7 @@ BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, size, error)
 	rc = vdev_disk_dio_put(dr);
 
 	/* Wake up synchronous waiter if this is the last outstanding bio */
-	if ((rc == 1) && (dr->dr_rw & (1 << DIO_RW_SYNCIO)))
+	if ((rc == 1) && vdev_disk_dio_is_sync(dr))
 		complete(&dr->dr_comp);
 
 	BIO_END_IO_RETURN(0);
@@ -339,6 +403,8 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr,
 	int bio_size, bio_count = 16;
 	int i = 0, error = 0, block_size;
 
+	ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size);
+
 retry:
 	dr = vdev_disk_dio_alloc(bio_count);
 	if (dr == NULL)
@@ -421,7 +487,7 @@ retry:
 	 * only synchronous consumer is vdev_disk_read_rootlabel() all other
 	 * IO originating from vdev_disk_io_start() is asynchronous.
 	 */
-	if (dr->dr_rw & (1 << DIO_RW_SYNCIO)) {
+	if (vdev_disk_dio_is_sync(dr)) {
 		wait_for_completion(&dr->dr_comp);
 		error = dr->dr_error;
 		ASSERT3S(atomic_read(&dr->dr_ref), ==, 1);
@@ -679,3 +745,6 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
 
 	return 0;
 }
+
+module_param(zfs_vdev_scheduler, charp, 0644);
+MODULE_PARM_DESC(zfs_vdev_scheduler, "IO Scheduler (noop)");
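
For reference: because elevator_switch() is not exported from the block layer, the patch's vdev_elevator_switch() shells out and writes the elevator name into the per-device sysfs attribute /sys/block/<disk>/queue/scheduler. The stand-alone user-space sketch below performs the equivalent sysfs write purely for illustration; it is not part of the patch, and the device name "sda" and the helper set_elevator() are placeholders.

/*
 * Minimal user-space sketch of the sysfs elevator switch done by
 * vdev_elevator_switch().  Illustrative only; requires root, and the
 * device name "sda" is a placeholder.
 */
#include <stdio.h>

static int
set_elevator(const char *device, const char *elevator)
{
	char path[256];
	FILE *fp;
	int rc;

	/* Per-device attribute, e.g. /sys/block/sda/queue/scheduler */
	(void) snprintf(path, sizeof (path),
	    "/sys/block/%s/queue/scheduler", device);

	fp = fopen(path, "w");
	if (fp == NULL)
		return (-1);

	/* Writing a scheduler name selects it for the device */
	rc = (fprintf(fp, "%s\n", elevator) < 0) ? -1 : 0;
	if (fclose(fp) != 0)
		rc = -1;

	return (rc);
}

int
main(void)
{
	/* Equivalent of: echo noop > /sys/block/sda/queue/scheduler */
	return (set_elevator("sda", "noop") == 0 ? 0 : 1);
}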
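
With zfs_vdev_scheduler exported as a module parameter, the elevator can also be chosen at module load time (for example, a zfs_vdev_scheduler=noop modprobe option) or changed later through /sys/module/zfs/parameters/zfs_vdev_scheduler, which the 0644 permissions above allow. Passing "none" skips the switch entirely, and a new value only takes effect when a vdev is opened, since vdev_elevator_switch() is called from vdev_disk_open().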