X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fvdev_disk.c;h=28a4861abd7260c2c1feb7246fbcfe5d64fd71c4;hb=23bdb07d4e4c435205d25d3efdb5fef2d089ce5e;hp=9ae8fbc18b47b9153ef85d9f820a9718072ffbc9;hpb=d148e95156b98ac226013d24962d1afa4a51c712;p=zfs.git

Review notes (free text ahead of the "diff --git" header; not part of any hunk):
  * Adds the zfs_vdev_scheduler module parameter (default VDEV_SCHEDULER) and
    vdev_elevator_switch(), which vdev_disk_open() calls to select the block
    device I/O scheduler by having /bin/sh write the scheduler name to
    /sys/block/<disk>/queue/scheduler.  Failures are logged and ignored.
  * bdev_capacity() now returns its sector count shifted left by 9, and both
    callers stop multiplying by the device block size; bi_sector is likewise
    computed as bio_offset >> 9.
  * zio->io_delay holds jiffies_64 at submit time and is converted to elapsed
    milliseconds on completion; vdev_disk_error() prints it.
  * Synchronous requests are detected through vdev_disk_dio_is_sync(), the
    failfast flag is applied through bio_set_flags_failfast(), and cache
    flushes are submitted with VDEV_WRITE_FLUSH_FUA instead of WRITE_BARRIER.
  * NOTE(review): the operands of the two #include context lines in the first
    hunk were lost when this patch was extracted; they are restored here as
    <sys/zio.h> and <sys/sunldi.h> -- TODO confirm against the upstream file
    before applying.

diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index 9ae8fbc..28a4861 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -33,6 +33,8 @@
 #include <sys/zio.h>
 #include <sys/sunldi.h>
 
+char *zfs_vdev_scheduler = VDEV_SCHEDULER;
+
 /*
  * Virtual device vector for disks.
  */
@@ -85,23 +87,77 @@ bdev_capacity(struct block_device *bdev)
 
 	/* The partition capacity referenced by the block device */
 	if (part)
-		return part->nr_sects;
+		return (part->nr_sects << 9);
 
 	/* Otherwise assume the full device capacity */
-	return get_capacity(bdev->bd_disk);
+	return (get_capacity(bdev->bd_disk) << 9);
 }
 
 static void
 vdev_disk_error(zio_t *zio)
 {
 #ifdef ZFS_DEBUG
-	printk("ZFS: zio error=%d type=%d offset=%llu "
-	    "size=%llu flags=%x\n", zio->io_error, zio->io_type,
+	printk("ZFS: zio error=%d type=%d offset=%llu size=%llu "
+	    "flags=%x delay=%llu\n", zio->io_error, zio->io_type,
 	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
-	    zio->io_flags);
+	    zio->io_flags, (u_longlong_t)zio->io_delay);
 #endif
 }
 
+/*
+ * Use the Linux 'noop' elevator for zfs managed block devices.  This
+ * strikes the ideal balance by allowing the zfs elevator to do all
+ * request ordering and prioritization, while allowing the Linux
+ * elevator to do the maximum front/back merging allowed by the
+ * physical device.  This yields the largest possible requests for
+ * the device with the lowest total overhead.
+ *
+ * Unfortunately we cannot directly call the elevator_switch() function
+ * because it is not exported from the block layer.  This means we have
+ * to use the sysfs interface and a user space upcall.  Pools will be
+ * automatically imported on module load so we must do this at device
+ * open time from the kernel.
+ */
+#define SET_SCHEDULER_CMD \
+	"exec 0</dev/null " \
+	"     1>/sys/block/%s/queue/scheduler " \
+	"     2>/dev/null; " \
+	"echo %s"
+
+static int
+vdev_elevator_switch(vdev_t *v, char *elevator)
+{
+	vdev_disk_t *vd = v->vdev_tsd;
+	struct block_device *bdev = vd->vd_bdev;
+	struct request_queue *q = bdev_get_queue(bdev);
+	char *device = bdev->bd_disk->disk_name;
+	char *argv[] = { "/bin/sh", "-c", NULL, NULL };
+	char *envp[] = { NULL };
+	int error;
+
+	/* Skip devices which are not whole disks (partitions) */
+	if (!v->vdev_wholedisk)
+		return (0);
+
+	/* Skip devices without schedulers (loop, ram, dm, etc) */
+	if (!q->elevator || !blk_queue_stackable(q))
+		return (0);
+
+	/* Leave existing scheduler when set to "none" */
+	if (strcmp(elevator, "none") == 0)
+		return (0);
+
+	argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
+	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+	if (error)
+		printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
+		       elevator, v->vdev_path, device, error);
+
+	strfree(argv[2]);
+
+	return (error);
+}
+
 static int
 vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
 {
@@ -162,11 +218,14 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
 	v->vdev_nowritecache = B_FALSE;
 
 	/* Physical volume size in bytes */
-	*psize = bdev_capacity(bdev) * block_size;
+	*psize = bdev_capacity(bdev);
 
 	/* Based on the minimum sector size set the block size */
 	*ashift = highbit(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
 
+	/* Try to set the io scheduler elevator algorithm */
+	(void) vdev_elevator_switch(v, zfs_vdev_scheduler);
+
 	return 0;
 }
 
@@ -220,6 +279,27 @@ vdev_disk_dio_free(dio_request_t *dr)
 	    sizeof(struct bio *) * dr->dr_bio_count);
 }
 
+static int
+vdev_disk_dio_is_sync(dio_request_t *dr)
+{
+#ifdef HAVE_BIO_RW_SYNC
+	/* BIO_RW_SYNC preferred interface from 2.6.12-2.6.29 */
+	return (dr->dr_rw & (1 << BIO_RW_SYNC));
+#else
+# ifdef HAVE_BIO_RW_SYNCIO
+	/* BIO_RW_SYNCIO preferred interface from 2.6.30-2.6.35 */
+	return (dr->dr_rw & (1 << BIO_RW_SYNCIO));
+# else
+#  ifdef HAVE_REQ_SYNC
+	/* REQ_SYNC preferred interface from 2.6.36-2.6.xx */
+	return (dr->dr_rw & REQ_SYNC);
+#  else
+#  error "Unable to determine bio sync flag"
+#  endif /* HAVE_REQ_SYNC */
+# endif /* HAVE_BIO_RW_SYNCIO */
+#endif /* HAVE_BIO_RW_SYNC */
+}
+
 static void
 vdev_disk_dio_get(dio_request_t *dr)
 {
@@ -242,6 +322,8 @@ vdev_disk_dio_put(dio_request_t *dr)
 		vdev_disk_dio_free(dr);
 
 		if (zio) {
+			zio->io_delay = jiffies_to_msecs(
+			    jiffies_64 - zio->io_delay);
 			zio->io_error = error;
 			ASSERT3S(zio->io_error, >=, 0);
 			if (zio->io_error)
@@ -282,7 +364,7 @@ BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, size, error)
 	rc = vdev_disk_dio_put(dr);
 
 	/* Wake up synchronous waiter this is the last outstanding bio */
-	if ((rc == 1) && (dr->dr_rw & (1 << DIO_RW_SYNCIO)))
+	if ((rc == 1) && vdev_disk_dio_is_sync(dr))
 		complete(&dr->dr_comp);
 
 	BIO_END_IO_RETURN(0);
@@ -335,21 +417,20 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr,
 	caddr_t bio_ptr;
 	uint64_t bio_offset;
 	int bio_size, bio_count = 16;
-	int i = 0, error = 0, block_size;
+	int i = 0, error = 0;
+
+	ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size);
 
 retry:
 	dr = vdev_disk_dio_alloc(bio_count);
 	if (dr == NULL)
 		return ENOMEM;
 
+	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
+		bio_set_flags_failfast(bdev, &flags);
+
 	dr->dr_zio = zio;
 	dr->dr_rw = flags;
-	block_size = vdev_bdev_block_size(bdev);
-
-#ifdef BIO_RW_FAILFAST
-	if (flags & (1 << BIO_RW_FAILFAST))
-		dr->dr_rw |= 1 << BIO_RW_FAILFAST;
-#endif /* BIO_RW_FAILFAST */
 
 	/*
 	 * When the IO size exceeds the maximum bio size for the request
@@ -390,7 +471,7 @@ retry:
 		vdev_disk_dio_get(dr);
 
 		dr->dr_bio[i]->bi_bdev = bdev;
-		dr->dr_bio[i]->bi_sector = bio_offset / block_size;
+		dr->dr_bio[i]->bi_sector = bio_offset >> 9;
 		dr->dr_bio[i]->bi_rw = dr->dr_rw;
 		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
 		dr->dr_bio[i]->bi_private = dr;
@@ -405,6 +486,8 @@ retry:
 
 	/* Extra reference to protect dio_request during submit_bio */
 	vdev_disk_dio_get(dr);
+	if (zio)
+		zio->io_delay = jiffies_64;
 
 	/* Submit all bio's associated with this dio */
 	for (i = 0; i < dr->dr_bio_count; i++)
@@ -419,7 +502,7 @@ retry:
 	 * only synchronous consumer is vdev_disk_read_rootlabel() all other
 	 * IO originating from vdev_disk_io_start() is asynchronous.
 	 */
-	if (dr->dr_rw & (1 << DIO_RW_SYNCIO)) {
+	if (vdev_disk_dio_is_sync(dr)) {
 		wait_for_completion(&dr->dr_comp);
 		error = dr->dr_error;
 		ASSERT3S(atomic_read(&dr->dr_ref), ==, 1);
@@ -434,6 +517,7 @@ int
 vdev_disk_physio(struct block_device *bdev, caddr_t kbuf,
 		 size_t size, uint64_t offset, int flags)
 {
+	bio_set_flags_failfast(bdev, &flags);
 	return __vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags);
 }
 
@@ -443,6 +527,7 @@ BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, size, rc)
 {
 	zio_t *zio = bio->bi_private;
 
+	zio->io_delay = jiffies_to_msecs(jiffies_64 - zio->io_delay);
 	zio->io_error = -rc;
 	if (rc && (rc == -EOPNOTSUPP))
 		zio->io_vd->vdev_nowritecache = B_TRUE;
@@ -473,7 +558,8 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
 	bio->bi_end_io = vdev_disk_io_flush_completion;
 	bio->bi_private = zio;
 	bio->bi_bdev = bdev;
-	submit_bio(WRITE_BARRIER, bio);
+	zio->io_delay = jiffies_64;
+	submit_bio(VDEV_WRITE_FLUSH_FUA, bio);
 
 	return 0;
 }
@@ -540,11 +626,6 @@ vdev_disk_io_start(zio_t *zio)
 		return ZIO_PIPELINE_CONTINUE;
 	}
 
-#ifdef BIO_RW_FAILFAST
-	if (zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))
-		flags |= (1 << BIO_RW_FAILFAST);
-#endif /* BIO_RW_FAILFAST */
-
 	error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data,
 		                   zio->io_size, zio->io_offset, flags);
 	if (error) {
@@ -633,7 +714,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
 	if (IS_ERR(bdev))
 		return -PTR_ERR(bdev);
 
-	s = bdev_capacity(bdev) * vdev_bdev_block_size(bdev);
+	s = bdev_capacity(bdev);
 	if (s == 0) {
 		vdev_bdev_close(bdev, vdev_bdev_mode(FREAD));
 		return EIO;
@@ -679,3 +760,6 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
 
 	return 0;
 }
+
+module_param(zfs_vdev_scheduler, charp, 0644);
+MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");