#include <sys/zio.h>
#include <sys/sunldi.h>
+char *zfs_vdev_scheduler = VDEV_SCHEDULER;
+
/*
* Virtual device vector for disks.
*/
/* The partition capacity referenced by the block device */
if (part)
- return part->nr_sects;
+ return (part->nr_sects << 9);
/* Otherwise assume the full device capacity */
- return get_capacity(bdev->bd_disk);
+ return (get_capacity(bdev->bd_disk) << 9);
}
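+
+/*
+ * Note: part->nr_sects and get_capacity() both count 512-byte
+ * sectors, so the left shift by 9 above (a multiply by 512) converts
+ * the capacity to bytes. For example, a disk of 2147483648 sectors
+ * reports 2147483648 << 9 = 1099511627776 bytes (1 TiB).
+ */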
static void
vdev_disk_error(zio_t *zio)
{
#ifdef ZFS_DEBUG
- printk("ZFS: zio error=%d type=%d offset=%llu "
- "size=%llu flags=%x\n", zio->io_error, zio->io_type,
+ printk("ZFS: zio error=%d type=%d offset=%llu size=%llu "
+ "flags=%x delay=%llu\n", zio->io_error, zio->io_type,
(u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
- zio->io_flags);
+ zio->io_flags, (u_longlong_t)zio->io_delay);
#endif
}
+/*
+ * Use the Linux 'noop' elevator for zfs managed block devices. This
+ * strikes the ideal balance by allowing the zfs elevator to do all
+ * request ordering and prioritization, while allowing the Linux
+ * elevator to do the maximum front/back merging allowed by the
+ * physical device. This yields the largest possible requests for
+ * the device with the lowest total overhead.
+ */
static int
-vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
+vdev_elevator_switch(vdev_t *v, char *elevator)
{
- struct block_device *bdev;
+ vdev_disk_t *vd = v->vdev_tsd;
+ struct block_device *bdev = vd->vd_bdev;
+ struct request_queue *q = bdev_get_queue(bdev);
+ char *device = bdev->bd_disk->disk_name;
+ int error;
+
+ /* Skip devices which are not whole disks (partitions) */
+ if (!v->vdev_wholedisk)
+ return (0);
+
+	/* Skip devices without schedulers (loop, ram, dm, etc.) */
+ if (!q->elevator || !blk_queue_stackable(q))
+ return (0);
+
+ /* Leave existing scheduler when set to "none" */
+ if (!strncmp(elevator, "none", 4) && (strlen(elevator) == 4))
+ return (0);
+
+#ifdef HAVE_ELEVATOR_CHANGE
+ error = elevator_change(q, elevator);
+#else
+	/* For pre-2.6.36 kernels, elevator_change() is not available.
+	 * Therefore we fall back to using a usermodehelper to echo the
+	 * elevator into sysfs. This requires /bin/echo and sysfs to be
+	 * mounted, which may not be true early in the boot process.
+ */
+# define SET_SCHEDULER_CMD \
+ "exec 0</dev/null " \
+ " 1>/sys/block/%s/queue/scheduler " \
+ " 2>/dev/null; " \
+ "echo %s"
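+
+	/*
+	 * Illustrative expansion only (the device name "sda" is
+	 * hypothetical): for elevator "noop" the command built below is
+	 *
+	 *   exec 0</dev/null 1>/sys/block/sda/queue/scheduler 2>/dev/null; echo noop
+	 */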
+
+ {
+ char *argv[] = { "/bin/sh", "-c", NULL, NULL };
+ char *envp[] = { NULL };
+
+ argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
+ error = call_usermodehelper(argv[0], argv, envp, 1);
+ strfree(argv[2]);
+ }
+#endif /* HAVE_ELEVATOR_CHANGE */
+ if (error)
+ printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
+ elevator, v->vdev_path, device, error);
+
+ return (error);
+}
+
+/*
+ * Expanding a whole disk vdev involves invoking BLKRRPART on the
+ * whole disk device. This poses a problem, because BLKRRPART will
+ * return EBUSY if one of the disk's partitions is open. That's why
+ * we have to do it here, just before opening the data partition.
+ * Unfortunately, BLKRRPART works by dropping all partitions and
+ * recreating them, which means that for a short time window, all
+ * /dev/sdxN device files disappear (until udev recreates them).
+ * This means two things:
+ * - When we open the data partition just after a BLKRRPART, we
+ * can't do it using the normal device file path because of the
+ * obvious race condition with udev. Instead, we use reliable
+ * kernel APIs to get a handle to the new partition device from
+ * the whole disk device.
+ * - Because vdev_disk_open() initially needs to find the device
+ * using its path, multiple vdev_disk_open() invocations in
+ * short succession on the same disk with BLKRRPARTs in the
+ * middle have a high probability of failure (because of the
+ * race condition with udev). A typical situation where this
+ * might happen is when the zpool userspace tool does a
+ * TRYIMPORT immediately followed by an IMPORT. For this
+ * reason, we only invoke BLKRRPART in the module when strictly
+ * necessary (zpool online -e case), and rely on userspace to
+ * do it when possible.
+ */
+static struct block_device *
+vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd)
+{
+#if defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK)
+ struct block_device *bdev, *result = ERR_PTR(-ENXIO);
+ struct gendisk *disk;
+ int error, partno;
+
+ bdev = vdev_bdev_open(path, vdev_bdev_mode(mode), vd);
+ if (IS_ERR(bdev))
+ return bdev;
+
+ disk = get_gendisk(bdev->bd_dev, &partno);
+ vdev_bdev_close(bdev, vdev_bdev_mode(mode));
+
+ if (disk) {
+ bdev = bdget(disk_devt(disk));
+ if (bdev) {
+ error = blkdev_get(bdev, vdev_bdev_mode(mode), vd);
+ if (error == 0)
+ error = ioctl_by_bdev(bdev, BLKRRPART, 0);
+ vdev_bdev_close(bdev, vdev_bdev_mode(mode));
+ }
+
+ bdev = bdget_disk(disk, partno);
+ if (bdev) {
+ error = blkdev_get(bdev,
+ vdev_bdev_mode(mode) | FMODE_EXCL, vd);
+ if (error == 0)
+ result = bdev;
+ }
+ put_disk(disk);
+ }
+
+ return result;
+#else
+ return ERR_PTR(-EOPNOTSUPP);
+#endif /* defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) */
+}
+
+static int
+vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *ashift)
+{
+ struct block_device *bdev = ERR_PTR(-ENXIO);
vdev_disk_t *vd;
int mode, block_size;
return EINVAL;
}
- vd = kmem_zalloc(sizeof(vdev_disk_t), KM_SLEEP);
+ vd = kmem_zalloc(sizeof(vdev_disk_t), KM_PUSHPAGE);
if (vd == NULL)
return ENOMEM;
* level vdev validation.
*/
mode = spa_mode(v->vdev_spa);
- bdev = vdev_bdev_open(v->vdev_path, vdev_bdev_mode(mode), vd);
+ if (v->vdev_wholedisk && v->vdev_expanding)
+ bdev = vdev_disk_rrpart(v->vdev_path, mode, vd);
+ if (IS_ERR(bdev))
+ bdev = vdev_bdev_open(v->vdev_path, vdev_bdev_mode(mode), vd);
if (IS_ERR(bdev)) {
kmem_free(vd, sizeof(vdev_disk_t));
return -PTR_ERR(bdev);
v->vdev_nowritecache = B_FALSE;
/* Physical volume size in bytes */
- *psize = bdev_capacity(bdev) * block_size;
+ *psize = bdev_capacity(bdev);
+
+ /* TODO: report possible expansion size */
+ *max_psize = *psize;
/* Based on the minimum sector size set the block size */
*ashift = highbit(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
+ /* Try to set the io scheduler elevator algorithm */
+ (void) vdev_elevator_switch(v, zfs_vdev_scheduler);
+
return 0;
}
int i;
dr = kmem_zalloc(sizeof(dio_request_t) +
- sizeof(struct bio *) * bio_count, KM_SLEEP);
+ sizeof(struct bio *) * bio_count, KM_PUSHPAGE);
if (dr) {
init_completion(&dr->dr_comp);
atomic_set(&dr->dr_ref, 0);
sizeof(struct bio *) * dr->dr_bio_count);
}
+static int
+vdev_disk_dio_is_sync(dio_request_t *dr)
+{
+#ifdef HAVE_BIO_RW_SYNC
+ /* BIO_RW_SYNC preferred interface from 2.6.12-2.6.29 */
+ return (dr->dr_rw & (1 << BIO_RW_SYNC));
+#else
+# ifdef HAVE_BIO_RW_SYNCIO
+ /* BIO_RW_SYNCIO preferred interface from 2.6.30-2.6.35 */
+ return (dr->dr_rw & (1 << BIO_RW_SYNCIO));
+# else
+# ifdef HAVE_REQ_SYNC
+ /* REQ_SYNC preferred interface from 2.6.36-2.6.xx */
+ return (dr->dr_rw & REQ_SYNC);
+# else
+# error "Unable to determine bio sync flag"
+# endif /* HAVE_REQ_SYNC */
+# endif /* HAVE_BIO_RW_SYNCIO */
+#endif /* HAVE_BIO_RW_SYNC */
+}
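+
+/*
+ * The kernel's READ_SYNC and WRITE_SYNC macros set the matching
+ * per-version sync bit tested above, so the one synchronous consumer,
+ * vdev_disk_read_rootlabel(), is detected here when its dio is
+ * submitted with READ_SYNC.
+ */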
+
static void
vdev_disk_dio_get(dio_request_t *dr)
{
vdev_disk_dio_free(dr);
if (zio) {
+ zio->io_delay = jiffies_to_msecs(
+ jiffies_64 - zio->io_delay);
zio->io_error = error;
ASSERT3S(zio->io_error, >=, 0);
if (zio->io_error)
rc = vdev_disk_dio_put(dr);
	/* Wake up the synchronous waiter if this is the last outstanding bio */
- if ((rc == 1) && (dr->dr_rw & (1 << DIO_RW_SYNCIO)))
+ if ((rc == 1) && vdev_disk_dio_is_sync(dr))
complete(&dr->dr_comp);
BIO_END_IO_RETURN(0);
caddr_t bio_ptr;
uint64_t bio_offset;
int bio_size, bio_count = 16;
- int i = 0, error = 0, block_size;
+ int i = 0, error = 0;
+
+ ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size);
retry:
dr = vdev_disk_dio_alloc(bio_count);
if (dr == NULL)
return ENOMEM;
+ if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
+ bio_set_flags_failfast(bdev, &flags);
+
dr->dr_zio = zio;
dr->dr_rw = flags;
- block_size = vdev_bdev_block_size(bdev);
-
-#ifdef BIO_RW_FAILFAST
- if (flags & (1 << BIO_RW_FAILFAST))
- dr->dr_rw |= 1 << BIO_RW_FAILFAST;
-#endif /* BIO_RW_FAILFAST */
/*
* When the IO size exceeds the maximum bio size for the request
vdev_disk_dio_get(dr);
dr->dr_bio[i]->bi_bdev = bdev;
- dr->dr_bio[i]->bi_sector = bio_offset / block_size;
+ dr->dr_bio[i]->bi_sector = bio_offset >> 9;
dr->dr_bio[i]->bi_rw = dr->dr_rw;
dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
dr->dr_bio[i]->bi_private = dr;
/* Extra reference to protect dio_request during submit_bio */
vdev_disk_dio_get(dr);
+ if (zio)
+ zio->io_delay = jiffies_64;
/* Submit all bio's associated with this dio */
for (i = 0; i < dr->dr_bio_count; i++)
	 * only synchronous consumer is vdev_disk_read_rootlabel(); all other
* IO originating from vdev_disk_io_start() is asynchronous.
*/
- if (dr->dr_rw & (1 << DIO_RW_SYNCIO)) {
+ if (vdev_disk_dio_is_sync(dr)) {
wait_for_completion(&dr->dr_comp);
error = dr->dr_error;
ASSERT3S(atomic_read(&dr->dr_ref), ==, 1);
vdev_disk_physio(struct block_device *bdev, caddr_t kbuf,
size_t size, uint64_t offset, int flags)
{
+ bio_set_flags_failfast(bdev, &flags);
return __vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags);
}
{
zio_t *zio = bio->bi_private;
+ zio->io_delay = jiffies_to_msecs(jiffies_64 - zio->io_delay);
zio->io_error = -rc;
if (rc && (rc == -EOPNOTSUPP))
zio->io_vd->vdev_nowritecache = B_TRUE;
bio->bi_end_io = vdev_disk_io_flush_completion;
bio->bi_private = zio;
bio->bi_bdev = bdev;
- submit_bio(WRITE_BARRIER, bio);
+ zio->io_delay = jiffies_64;
+ submit_bio(VDEV_WRITE_FLUSH_FUA, bio);
return 0;
}
return ZIO_PIPELINE_CONTINUE;
}
-#ifdef BIO_RW_FAILFAST
- if (zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))
- flags |= (1 << BIO_RW_FAILFAST);
-#endif /* BIO_RW_FAILFAST */
-
error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data,
zio->io_size, zio->io_offset, flags);
if (error) {
if (IS_ERR(bdev))
return -PTR_ERR(bdev);
- s = bdev_capacity(bdev) * vdev_bdev_block_size(bdev);
+ s = bdev_capacity(bdev);
if (s == 0) {
vdev_bdev_close(bdev, vdev_bdev_mode(FREAD));
return EIO;
}
size = P2ALIGN_TYPED(s, sizeof(vdev_label_t), uint64_t);
- label = vmem_alloc(sizeof(vdev_label_t), KM_SLEEP);
+ label = vmem_alloc(sizeof(vdev_label_t), KM_PUSHPAGE);
for (i = 0; i < VDEV_LABELS; i++) {
uint64_t offset, state, txg = 0;
return 0;
}
+
+module_param(zfs_vdev_scheduler, charp, 0644);
+MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
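+
+/*
+ * Illustrative usage (not part of this change): the scheduler may be
+ * selected at module load time, e.g.
+ *
+ *   modprobe zfs zfs_vdev_scheduler=deadline
+ *
+ * or rewritten at runtime via the 0644 parameter node
+ * /sys/module/zfs/parameters/zfs_vdev_scheduler; the new value is
+ * applied by vdev_elevator_switch() the next time a vdev is opened.
+ */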