diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index 51062ef..31d1a28 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -33,6 +33,9 @@
 #include <sys/zio.h>
 #include <sys/sunldi.h>
 
+char *zfs_vdev_scheduler = VDEV_SCHEDULER;
+static void *zfs_vdev_holder = VDEV_HOLDER;
+
 /*
  * Virtual device vector for disks.
  */
@@ -85,10 +88,10 @@ bdev_capacity(struct block_device *bdev)
 	struct hd_struct *part = bdev->bd_part;
 
 	/* The partition capacity referenced by the block device */
 	if (part)
-		return part->nr_sects;
+		return (part->nr_sects << 9);
 
 	/* Otherwise assume the full device capacity */
-	return get_capacity(bdev->bd_disk);
+	return (get_capacity(bdev->bd_disk) << 9);
 }
 
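bdev_capacity() now returns bytes rather than sectors: the block layer's part->nr_sects and get_capacity() always count 512-byte sectors, regardless of the device's logical block size, so a left shift by 9 converts the count to bytes. A minimal userspace sketch of the conversion, with a made-up sector count:

    #include <stdio.h>

    int
    main(void)
    {
        /* Hypothetical 512-byte sector count for a 20 GiB disk */
        unsigned long long nr_sects = 41943040ULL;
        unsigned long long bytes = nr_sects << 9;   /* sectors * 512 */

        printf("%llu sectors = %llu bytes (%llu GiB)\n",
            nr_sects, bytes, bytes >> 30);
        return 0;
    }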
@@ -102,10 +105,142 @@ vdev_disk_error(zio_t *zio)
 #endif
 }
 
+/*
+ * Use the Linux 'noop' elevator for zfs managed block devices.  This
+ * strikes the ideal balance by allowing the zfs elevator to do all
+ * request ordering and prioritization, while allowing the Linux
+ * elevator to do the maximum front/back merging allowed by the
+ * physical device.  This yields the largest possible requests for
+ * the device with the lowest total overhead.
+ */
 static int
-vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
+vdev_elevator_switch(vdev_t *v, char *elevator)
 {
-	struct block_device *bdev;
+	vdev_disk_t *vd = v->vdev_tsd;
+	struct block_device *bdev = vd->vd_bdev;
+	struct request_queue *q = bdev_get_queue(bdev);
+	char *device = bdev->bd_disk->disk_name;
+	int error;
+
+	/*
+	 * Skip devices which are not whole disks (partitions).
+	 * Device-mapper devices are excepted since they may be whole
+	 * disks despite the vdev_wholedisk flag, in which case we can
+	 * and should switch the elevator.  If the device-mapper device
+	 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
+	 * "Skip devices without schedulers" check below will fail.
+	 */
+	if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
+		return (0);
+
+	/* Skip devices without schedulers (loop, ram, dm, etc) */
+	if (!q->elevator || !blk_queue_stackable(q))
+		return (0);
+
+	/* Leave existing scheduler when set to "none" */
+	if (!strncmp(elevator, "none", 4) && (strlen(elevator) == 4))
+		return (0);
+
+#ifdef HAVE_ELEVATOR_CHANGE
+	error = elevator_change(q, elevator);
+#else
+	/* For pre-2.6.36 kernels elevator_change() is not available.
+	 * Therefore we fall back to using a usermodehelper to echo the
+	 * elevator into sysfs.  This requires /bin/echo and sysfs to be
+	 * mounted which may not be true early in the boot process.
+	 */
+# define SET_SCHEDULER_CMD \
+	"exec 0</dev/null " \
+	"     1>/sys/block/%s/queue/scheduler " \
+	"     2>/dev/null; " \
+	"echo %s"
+
+	{
+		char *argv[] = { "/bin/sh", "-c", NULL, NULL };
+		char *envp[] = { NULL };
+
+		argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
+		error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+		strfree(argv[2]);
+	}
+#endif /* HAVE_ELEVATOR_CHANGE */
+	if (error)
+		printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
+		    elevator, v->vdev_path, device, error);
+
+	return (error);
+}
+
+/*
+ * Expanding a whole disk vdev involves invoking BLKRRPART on the
+ * whole disk device.  This poses a problem, because BLKRRPART will
+ * return EBUSY if one of the disk's partitions is open.  That's why
+ * we have to do it here, just before opening the data partition.
+ * Unfortunately, BLKRRPART works by dropping all partitions and
+ * recreating them, which means that for a short time window, all
+ * /dev/sdxN device files disappear (until udev recreates them).
+ * This means two things:
+ *  - When we open the data partition just after a BLKRRPART, we
+ *    can't do it using the normal device file path because of the
+ *    obvious race condition with udev.  Instead, we use reliable
+ *    kernel APIs to get a handle to the new partition device from
+ *    the whole disk device.
+ *  - Because vdev_disk_open() initially needs to find the device
+ *    using its path, multiple vdev_disk_open() invocations in
+ *    short succession on the same disk with BLKRRPARTs in the
+ *    middle have a high probability of failure (because of the
+ *    race condition with udev).  A typical situation where this
+ *    might happen is when the zpool userspace tool does a
+ *    TRYIMPORT immediately followed by an IMPORT.  For this
+ *    reason, we only invoke BLKRRPART in the module when strictly
+ *    necessary (zpool online -e case), and rely on userspace to
+ *    do it when possible.
+ */
+static struct block_device *
+vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd)
+{
+#if defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK)
+	struct block_device *bdev, *result = ERR_PTR(-ENXIO);
+	struct gendisk *disk;
+	int error, partno;
+
+	bdev = vdev_bdev_open(path, vdev_bdev_mode(mode), zfs_vdev_holder);
+	if (IS_ERR(bdev))
+		return bdev;
+
+	disk = get_gendisk(bdev->bd_dev, &partno);
+	vdev_bdev_close(bdev, vdev_bdev_mode(mode));
+
+	if (disk) {
+		bdev = bdget(disk_devt(disk));
+		if (bdev) {
+			error = blkdev_get(bdev, vdev_bdev_mode(mode), vd);
+			if (error == 0)
+				error = ioctl_by_bdev(bdev, BLKRRPART, 0);
+			vdev_bdev_close(bdev, vdev_bdev_mode(mode));
+		}
+
+		bdev = bdget_disk(disk, partno);
+		if (bdev) {
+			error = blkdev_get(bdev,
+			    vdev_bdev_mode(mode) | FMODE_EXCL, vd);
+			if (error == 0)
+				result = bdev;
+		}
+		put_disk(disk);
+	}
+
+	return result;
+#else
+	return ERR_PTR(-EOPNOTSUPP);
+#endif /* defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) */
+}
+
+static int
+vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
+    uint64_t *ashift)
+{
+	struct block_device *bdev = ERR_PTR(-ENXIO);
 	vdev_disk_t *vd;
 	int mode, block_size;
 
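BLKRRPART, used by vdev_disk_rrpart() above, is the same request that blockdev --rereadpt issues from userspace, and it is the EBUSY behavior described in the comment that forces this in-kernel reopen dance. A hedged userspace sketch of the ioctl (the device path is a placeholder):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>   /* BLKRRPART */

    int
    main(void)
    {
        /* Placeholder whole-disk device; pick an idle disk */
        int fd = open("/dev/sdX", O_RDONLY);

        if (fd < 0)
            return 1;

        /* Drops and recreates the partitions; fails with EBUSY
         * while any partition of this disk is held open. */
        if (ioctl(fd, BLKRRPART, 0) != 0)
            perror("BLKRRPART");

        close(fd);
        return 0;
    }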
@@ -115,7 +250,17 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
 		return EINVAL;
 	}
 
-	vd = kmem_zalloc(sizeof(vdev_disk_t), KM_SLEEP);
+	/*
+	 * Reopen the device if it's not currently open.  Otherwise,
+	 * just update the physical size of the device.
+	 */
+	if (v->vdev_tsd != NULL) {
+		ASSERT(v->vdev_reopening);
+		vd = v->vdev_tsd;
+		goto skip_open;
+	}
+
+	vd = kmem_zalloc(sizeof(vdev_disk_t), KM_PUSHPAGE);
 	if (vd == NULL)
 		return ENOMEM;
 
@@ -123,18 +268,22 @@
 	 * Devices are always opened by the path provided at configuration
 	 * time.  This means that if the provided path is a udev by-id path
 	 * then drives may be recabled without an issue.  If the provided
-	 * path is a udev by-path path then the physical location information
+	 * path is a udev by-path path, then the physical location information
 	 * will be preserved.  This can be critical for more complicated
 	 * configurations where drives are located in specific physical
-	 * locations to maximize the systems tolerence to component failure.
-	 * Alternately you can provide your own udev rule to flexibly map
+	 * locations to maximize the system's tolerance to component failure.
+	 * Alternatively, you can provide your own udev rule to flexibly map
 	 * the drives as you see fit.  It is not advised that you use the
-	 * /dev/[hd]d devices which may be reorder due to probing order.
+	 * /dev/[hd]d devices which may be reordered due to probing order.
 	 * Devices in the wrong locations will be detected by the higher
 	 * level vdev validation.
 	 */
 	mode = spa_mode(v->vdev_spa);
-	bdev = vdev_bdev_open(v->vdev_path, vdev_bdev_mode(mode), vd);
+	if (v->vdev_wholedisk && v->vdev_expanding)
+		bdev = vdev_disk_rrpart(v->vdev_path, mode, vd);
+	if (IS_ERR(bdev))
+		bdev = vdev_bdev_open(v->vdev_path,
+		    vdev_bdev_mode(mode), zfs_vdev_holder);
 	if (IS_ERR(bdev)) {
 		kmem_free(vd, sizeof(vdev_disk_t));
 		return -PTR_ERR(bdev);
@@ -142,31 +291,26 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
 
 	v->vdev_tsd = vd;
 	vd->vd_bdev = bdev;
-	block_size = vdev_bdev_block_size(bdev);
-
-	/* We think the wholedisk property should always be set when this
-	 * function is called. ASSERT here so if any legitimate cases exist
-	 * where it's not set, we'll find them during debugging. If we never
-	 * hit the ASSERT, this and the following conditional statement can be
-	 * removed. */
-	ASSERT3S(v->vdev_wholedisk, !=, -1ULL);
-
-	/* The wholedisk property was initialized to -1 in vdev_alloc() if it
-	 * was unspecified. In that case, check if this is a whole device.
-	 * When bdev->bd_contains == bdev we have a whole device and not simply
-	 * a partition. */
-	if (v->vdev_wholedisk == -1ULL)
-		v->vdev_wholedisk = (bdev->bd_contains == bdev);
+
+skip_open:
+	/* Determine the physical block size */
+	block_size = vdev_bdev_block_size(vd->vd_bdev);
 
 	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
 	v->vdev_nowritecache = B_FALSE;
 
 	/* Physical volume size in bytes */
-	*psize = bdev_capacity(bdev) * block_size;
+	*psize = bdev_capacity(vd->vd_bdev);
+
+	/* TODO: report possible expansion size */
+	*max_psize = *psize;
 
 	/* Based on the minimum sector size set the block size */
 	*ashift = highbit(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
 
+	/* Try to set the io scheduler elevator algorithm */
+	(void) vdev_elevator_switch(v, zfs_vdev_scheduler);
+
 	return 0;
 }
 
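In the ashift calculation above, highbit() returns the 1-based index of the highest set bit, so for a power-of-two block size highbit(x) - 1 is exactly log2(x): a 512-byte sector device yields ashift 9 and a 4096-byte device yields ashift 12. A self-contained sketch, where highbit() is a stand-in for the ZFS helper of the same name:

    #include <stdio.h>

    /* 1-based index of the highest set bit, 0 if no bits are set */
    static int
    highbit(unsigned long long v)
    {
        int bit;

        for (bit = 0; v != 0; bit++)
            v >>= 1;
        return bit;
    }

    int
    main(void)
    {
        unsigned long long block_size[] = { 512, 4096 };
        int i;

        /* MAX(block_size, SPA_MINBLOCKSIZE) with SPA_MINBLOCKSIZE = 512 */
        for (i = 0; i < 2; i++)
            printf("block size %-4llu -> ashift %d\n", block_size[i],
                highbit(block_size[i] > 512 ? block_size[i] : 512) - 1);
        return 0;   /* prints ashift 9 and ashift 12 */
    }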
@@ -175,7 +319,7 @@ vdev_disk_close(vdev_t *v)
 {
 	vdev_disk_t *vd = v->vdev_tsd;
 
-	if (vd == NULL)
+	if (v->vdev_reopening || vd == NULL)
 		return;
 
 	if (vd->vd_bdev != NULL)
@@ -193,7 +337,7 @@ vdev_disk_dio_alloc(int bio_count)
 	int i;
 
 	dr = kmem_zalloc(sizeof(dio_request_t) +
-	    sizeof(struct bio *) * bio_count, KM_SLEEP);
+	    sizeof(struct bio *) * bio_count, KM_PUSHPAGE);
 	if (dr) {
 		init_completion(&dr->dr_comp);
 		atomic_set(&dr->dr_ref, 0);
@@ -220,6 +364,27 @@ vdev_disk_dio_free(dio_request_t *dr)
 		sizeof(struct bio *) * dr->dr_bio_count);
 }
 
+static int
+vdev_disk_dio_is_sync(dio_request_t *dr)
+{
+#ifdef HAVE_BIO_RW_SYNC
+	/* BIO_RW_SYNC preferred interface from 2.6.12-2.6.29 */
+	return (dr->dr_rw & (1 << BIO_RW_SYNC));
+#else
+# ifdef HAVE_BIO_RW_SYNCIO
+	/* BIO_RW_SYNCIO preferred interface from 2.6.30-2.6.35 */
+	return (dr->dr_rw & (1 << BIO_RW_SYNCIO));
+# else
+#  ifdef HAVE_REQ_SYNC
+	/* REQ_SYNC preferred interface from 2.6.36-2.6.xx */
+	return (dr->dr_rw & REQ_SYNC);
+#  else
+#   error "Unable to determine bio sync flag"
+#  endif /* HAVE_REQ_SYNC */
+# endif /* HAVE_BIO_RW_SYNCIO */
+#endif /* HAVE_BIO_RW_SYNC */
+}
+
 static void
 vdev_disk_dio_get(dio_request_t *dr)
 {
@@ -242,8 +407,7 @@ vdev_disk_dio_put(dio_request_t *dr)
 		vdev_disk_dio_free(dr);
 
 		if (zio) {
-			zio->io_delay = jiffies_to_msecs(
-			    jiffies_64 - zio->io_delay);
+			zio->io_delay = jiffies_64 - zio->io_delay;
 			zio->io_error = error;
 			ASSERT3S(zio->io_error, >=, 0);
 			if (zio->io_error)
@@ -284,7 +448,7 @@ BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, size, error)
 	rc = vdev_disk_dio_put(dr);
 
-	/* Wake up synchronous waiter this is the last outstanding bio */
-	if ((rc == 1) && (dr->dr_rw & (1 << DIO_RW_SYNCIO)))
+	/* Wake up the synchronous waiter if this is the last outstanding bio */
+	if ((rc == 1) && vdev_disk_dio_is_sync(dr))
 		complete(&dr->dr_comp);
 
 	BIO_END_IO_RETURN(0);
@@ -337,7 +501,9 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr,
 	caddr_t bio_ptr;
 	uint64_t bio_offset;
 	int bio_size, bio_count = 16;
-	int i = 0, error = 0, block_size;
+	int i = 0, error = 0;
+
+	ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size);
 
 retry:
 	dr = vdev_disk_dio_alloc(bio_count);
@@ -349,7 +515,6 @@ retry:
 
 	dr->dr_zio = zio;
 	dr->dr_rw = flags;
-	block_size = vdev_bdev_block_size(bdev);
 
 	/*
 	 * When the IO size exceeds the maximum bio size for the request
@@ -375,7 +540,6 @@ retry:
 		if (dr->dr_bio_count == i) {
 			vdev_disk_dio_free(dr);
 			bio_count *= 2;
-			printk("WARNING: Resized bio's/dio to %d\n",bio_count);
 			goto retry;
 		}
 
@@ -390,7 +554,7 @@ retry:
 		vdev_disk_dio_get(dr);
 
 		dr->dr_bio[i]->bi_bdev = bdev;
-		dr->dr_bio[i]->bi_sector = bio_offset / block_size;
+		dr->dr_bio[i]->bi_sector = bio_offset >> 9;
 		dr->dr_bio[i]->bi_rw = dr->dr_rw;
 		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
 		dr->dr_bio[i]->bi_private = dr;
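The retry: path above (where the diagnostic printk is dropped) is a grow-and-retry allocation: __vdev_disk_physio() starts with a guess of 16 bios and, whenever a request spans more, throws the partially built request away and rebuilds it with twice the slots. A minimal sketch of the pattern, with hypothetical names:

    #include <stdlib.h>

    /* Hypothetical stand-in for the dio request bookkeeping */
    typedef struct dio_sketch {
        int ds_bio_count;   /* bio slots available */
    } dio_sketch_t;

    static dio_sketch_t *
    dio_alloc_for(int bios_needed)
    {
        int bio_count = 16; /* initial guess, as in the driver */
        dio_sketch_t *dr;

    retry:
        dr = calloc(1, sizeof (*dr));
        if (dr == NULL)
            return NULL;
        dr->ds_bio_count = bio_count;

        /* Guessed too small: discard and retry with double the slots */
        if (bios_needed > dr->ds_bio_count) {
            free(dr);
            bio_count *= 2;
            goto retry;
        }
        return dr;
    }

    int
    main(void)
    {
        dio_sketch_t *dr = dio_alloc_for(40);   /* ends up with 64 slots */

        free(dr);
        return 0;
    }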
@@ -421,7 +585,7 @@ retry:
-	 * only synchronous consumer is vdev_disk_read_rootlabel() all other
+	 * only synchronous consumer is vdev_disk_read_rootlabel(); all other
 	 * IO originating from vdev_disk_io_start() is asynchronous.
 	 */
-	if (dr->dr_rw & (1 << DIO_RW_SYNCIO)) {
+	if (vdev_disk_dio_is_sync(dr)) {
 		wait_for_completion(&dr->dr_comp);
 		error = dr->dr_error;
 		ASSERT3S(atomic_read(&dr->dr_ref), ==, 1);
@@ -440,13 +604,11 @@ vdev_disk_physio(struct block_device *bdev, caddr_t kbuf,
 	return __vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags);
 }
 
-/* 2.6.24 API change */
-#ifdef HAVE_BIO_EMPTY_BARRIER
 BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, size, rc)
 {
 	zio_t *zio = bio->bi_private;
 
-	zio->io_delay = jiffies_to_msecs(jiffies_64 - zio->io_delay);
+	zio->io_delay = jiffies_64 - zio->io_delay;
 	zio->io_error = -rc;
 	if (rc && (rc == -EOPNOTSUPP))
 		zio->io_vd->vdev_nowritecache = B_TRUE;
@@ -478,17 +640,10 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
 	bio->bi_private = zio;
 	bio->bi_bdev = bdev;
 	zio->io_delay = jiffies_64;
-	submit_bio(WRITE_BARRIER, bio);
+	submit_bio(VDEV_WRITE_FLUSH_FUA, bio);
 
 	return 0;
 }
-#else
-static int
-vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
-{
-	return ENOTSUP;
-}
-#endif /* HAVE_BIO_EMPTY_BARRIER */
 
 static int
 vdev_disk_io_start(zio_t *zio)
@@ -629,18 +784,18 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
 	uint64_t s, size;
 	int i;
 
-	bdev = vdev_bdev_open(devpath, vdev_bdev_mode(FREAD), NULL);
+	bdev = vdev_bdev_open(devpath, vdev_bdev_mode(FREAD), zfs_vdev_holder);
 	if (IS_ERR(bdev))
 		return -PTR_ERR(bdev);
 
-	s = bdev_capacity(bdev) * vdev_bdev_block_size(bdev);
+	s = bdev_capacity(bdev);
 	if (s == 0) {
 		vdev_bdev_close(bdev, vdev_bdev_mode(FREAD));
 		return EIO;
 	}
 
 	size = P2ALIGN_TYPED(s, sizeof(vdev_label_t), uint64_t);
-	label = vmem_alloc(sizeof(vdev_label_t), KM_SLEEP);
+	label = vmem_alloc(sizeof(vdev_label_t), KM_PUSHPAGE);
 
 	for (i = 0; i < VDEV_LABELS; i++) {
 		uint64_t offset, state, txg = 0;
@@ -679,3 +834,6 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
 
 	return 0;
 }
+
+module_param(zfs_vdev_scheduler, charp, 0644);
+MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
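A worked example of the P2ALIGN_TYPED() rounding used by vdev_disk_read_rootlabel(): the byte capacity is rounded down to a multiple of sizeof (vdev_label_t), which is 256 KiB, before the label offsets are derived from it. The macro and constants below are simplified stand-ins:

    #include <stdint.h>
    #include <stdio.h>

    #define VDEV_LABEL_SIZE (256ULL * 1024) /* sizeof (vdev_label_t) */
    /* Round x down to a multiple of align; align must be a power of two */
    #define P2ALIGN(x, align)   ((x) & -(align))

    int
    main(void)
    {
        uint64_t s = 21474836480ULL + 12345;    /* ~20 GiB, unaligned */
        uint64_t size = P2ALIGN(s, VDEV_LABEL_SIZE);

        printf("capacity %llu -> label-aligned %llu\n",
            (unsigned long long)s, (unsigned long long)size);
        return 0;
    }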