diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index 51062ef..31d1a28 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -33,6 +33,9 @@
 #include <sys/zio.h>
 #include <sys/sunldi.h>
 
+char *zfs_vdev_scheduler = VDEV_SCHEDULER;
+static void *zfs_vdev_holder = VDEV_HOLDER;
+
 /*
  * Virtual device vector for disks.
  */
@@ -85,10 +88,10 @@ bdev_capacity(struct block_device *bdev)
 	struct hd_struct *part = bdev->bd_part;
 
 	/* The partition capacity referenced by the block device */
 	if (part)
-		return part->nr_sects;
+		return (part->nr_sects << 9);
 
 	/* Otherwise assume the full device capacity */
-	return get_capacity(bdev->bd_disk);
+	return (get_capacity(bdev->bd_disk) << 9);
 }
 
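bdev_capacity() now returns bytes rather than sectors: the block layer's part->nr_sects and get_capacity() always count 512-byte sectors, regardless of the device's logical block size, so a left shift by 9 converts the count to bytes. A minimal userspace sketch of the conversion, with a made-up sector count:

    #include <stdio.h>

    int
    main(void)
    {
        /* Hypothetical 512-byte sector count for a 20 GiB disk */
        unsigned long long nr_sects = 41943040ULL;
        unsigned long long bytes = nr_sects << 9;   /* sectors * 512 */

        printf("%llu sectors = %llu bytes (%llu GiB)\n",
            nr_sects, bytes, bytes >> 30);
        return 0;
    }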
@@ -102,10 +105,142 @@ vdev_disk_error(zio_t *zio)
 #endif
 }
 
+/*
+ * Use the Linux 'noop' elevator for zfs managed block devices.  This
+ * strikes the ideal balance by allowing the zfs elevator to do all
+ * request ordering and prioritization, while allowing the Linux
+ * elevator to do the maximum front/back merging allowed by the
+ * physical device.  This yields the largest possible requests for
+ * the device with the lowest total overhead.
+ */
 static int
-vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
+vdev_elevator_switch(vdev_t *v, char *elevator)
 {
-	struct block_device *bdev;
+	vdev_disk_t *vd = v->vdev_tsd;
+	struct block_device *bdev = vd->vd_bdev;
+	struct request_queue *q = bdev_get_queue(bdev);
+	char *device = bdev->bd_disk->disk_name;
+	int error;
+
+	/*
+	 * Skip devices which are not whole disks (partitions).
+	 * Device-mapper devices are excepted since they may be whole
+	 * disks despite the vdev_wholedisk flag, in which case we can
+	 * and should switch the elevator.  If the device-mapper device
+	 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
+	 * "Skip devices without schedulers" check below will fail.
+	 */
+	if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
+		return (0);
+
+	/* Skip devices without schedulers (loop, ram, dm, etc) */
+	if (!q->elevator || !blk_queue_stackable(q))
+		return (0);
+
+	/* Leave existing scheduler when set to "none" */
+	if (!strncmp(elevator, "none", 4) && (strlen(elevator) == 4))
+		return (0);
+
+#ifdef HAVE_ELEVATOR_CHANGE
+	error = elevator_change(q, elevator);
+#else
+	/* For pre-2.6.36 kernels elevator_change() is not available.
+	 * Therefore we fall back to using a usermodehelper to echo the
+	 * elevator into sysfs.  This requires /bin/echo and sysfs to be
+	 * mounted which may not be true early in the boot process.
+	 */
+# define SET_SCHEDULER_CMD \
+	"exec 0</dev/null " \
+	"     1>/sys/block/%s/queue/scheduler " \
+	"     2>/dev/null; " \
+	"echo %s"
+
+	{
+		char *argv[] = { "/bin/sh", "-c", NULL, NULL };
+		char *envp[] = { NULL };
+
+		argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
+		error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+		strfree(argv[2]);
+	}
+#endif /* HAVE_ELEVATOR_CHANGE */
+	if (error)
+		printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
+		    elevator, v->vdev_path, device, error);
+
+	return (error);
+}
+
+/*
+ * Expanding a whole disk vdev involves invoking BLKRRPART on the
+ * whole disk device.  This poses a problem, because BLKRRPART will
+ * return EBUSY if one of the disk's partitions is open.  That's why
+ * we have to do it here, just before opening the data partition.
+ * Unfortunately, BLKRRPART works by dropping all partitions and
+ * recreating them, which means that for a short time window, all
+ * /dev/sdxN device files disappear (until udev recreates them).
+ * This means two things:
+ *  - When we open the data partition just after a BLKRRPART, we
+ *    can't do it using the normal device file path because of the
+ *    obvious race condition with udev.  Instead, we use reliable
+ *    kernel APIs to get a handle to the new partition device from
+ *    the whole disk device.
+ *  - Because vdev_disk_open() initially needs to find the device
+ *    using its path, multiple vdev_disk_open() invocations in
+ *    short succession on the same disk with BLKRRPARTs in the
+ *    middle have a high probability of failure (because of the
+ *    race condition with udev).  A typical situation where this
+ *    might happen is when the zpool userspace tool does a
+ *    TRYIMPORT immediately followed by an IMPORT.  For this
+ *    reason, we only invoke BLKRRPART in the module when strictly
+ *    necessary (zpool online -e case), and rely on userspace to
+ *    do it when possible.
+ */
+static struct block_device *
+vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd)
+{
+#if defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK)
+	struct block_device *bdev, *result = ERR_PTR(-ENXIO);
+	struct gendisk *disk;
+	int error, partno;
+
+	bdev = vdev_bdev_open(path, vdev_bdev_mode(mode), zfs_vdev_holder);
+	if (IS_ERR(bdev))
+		return bdev;
+
+	disk = get_gendisk(bdev->bd_dev, &partno);
+	vdev_bdev_close(bdev, vdev_bdev_mode(mode));
+
+	if (disk) {
+		bdev = bdget(disk_devt(disk));
+		if (bdev) {
+			error = blkdev_get(bdev, vdev_bdev_mode(mode), vd);
+			if (error == 0)
+				error = ioctl_by_bdev(bdev, BLKRRPART, 0);
+			vdev_bdev_close(bdev, vdev_bdev_mode(mode));
+		}
+
+		bdev = bdget_disk(disk, partno);
+		if (bdev) {
+			error = blkdev_get(bdev,
+			    vdev_bdev_mode(mode) | FMODE_EXCL, vd);
+			if (error == 0)
+				result = bdev;
+		}
+		put_disk(disk);
+	}
+
+	return result;
+#else
+	return ERR_PTR(-EOPNOTSUPP);
+#endif /* defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) */
+}
+
+static int
+vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
+    uint64_t *ashift)
+{
+	struct block_device *bdev = ERR_PTR(-ENXIO);
 	vdev_disk_t *vd;
 	int mode, block_size;
 
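BLKRRPART, used by vdev_disk_rrpart() above, is the same request that blockdev --rereadpt issues from userspace, and it is the EBUSY behavior described in the comment that forces this in-kernel reopen dance. A hedged userspace sketch of the ioctl (the device path is a placeholder):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>   /* BLKRRPART */

    int
    main(void)
    {
        /* Placeholder whole-disk device; pick an idle disk */
        int fd = open("/dev/sdX", O_RDONLY);

        if (fd < 0)
            return 1;

        /* Drops and recreates the partitions; fails with EBUSY
         * while any partition of this disk is held open. */
        if (ioctl(fd, BLKRRPART, 0) != 0)
            perror("BLKRRPART");

        close(fd);
        return 0;
    }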
@@ -115,7 +250,17 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
 		return EINVAL;
 	}
 
-	vd = kmem_zalloc(sizeof(vdev_disk_t), KM_SLEEP);
+	/*
+	 * Reopen the device if it's not currently open.  Otherwise,
+	 * just update the physical size of the device.
+	 */
+	if (v->vdev_tsd != NULL) {
+		ASSERT(v->vdev_reopening);
+		vd = v->vdev_tsd;
+		goto skip_open;
+	}
+
+	vd = kmem_zalloc(sizeof(vdev_disk_t), KM_PUSHPAGE);
 	if (vd == NULL)
 		return ENOMEM;
 
@@ -123,18 +268,22 @@
 	 * Devices are always opened by the path provided at configuration
 	 * time.  This means that if the provided path is a udev by-id path
 	 * then drives may be recabled without an issue.  If the provided
-	 * path is a udev by-path path then the physical location information
+	 * path is a udev by-path path, then the physical location information
 	 * will be preserved.  This can be critical for more complicated
 	 * configurations where drives are located in specific physical
-	 * locations to maximize the systems tolerence to component failure.
-	 * Alternately you can provide your own udev rule to flexibly map
+	 * locations to maximize the system's tolerance to component failure.
+	 * Alternatively, you can provide your own udev rule to flexibly map
 	 * the drives as you see fit.  It is not advised that you use the
-	 * /dev/[hd]d devices which may be reorder due to probing order.
+	 * /dev/[hd]d devices which may be reordered due to probing order.
 	 * Devices in the wrong locations will be detected by the higher
 	 * level vdev validation.
 	 */
 	mode = spa_mode(v->vdev_spa);
-	bdev = vdev_bdev_open(v->vdev_path, vdev_bdev_mode(mode), vd);
+	if (v->vdev_wholedisk && v->vdev_expanding)
+		bdev = vdev_disk_rrpart(v->vdev_path, mode, vd);
+	if (IS_ERR(bdev))
+		bdev = vdev_bdev_open(v->vdev_path,
+		    vdev_bdev_mode(mode), zfs_vdev_holder);
 	if (IS_ERR(bdev)) {
 		kmem_free(vd, sizeof(vdev_disk_t));
 		return -PTR_ERR(bdev);
@@ -142,31 +291,26 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
 
 	v->vdev_tsd = vd;
 	vd->vd_bdev = bdev;
-	block_size = vdev_bdev_block_size(bdev);
-
-	/* We think the wholedisk property should always be set when this
-	 * function is called. ASSERT here so if any legitimate cases exist
-	 * where it's not set, we'll find them during debugging. If we never
-	 * hit the ASSERT, this and the following conditional statement can be
-	 * removed. */
-	ASSERT3S(v->vdev_wholedisk, !=, -1ULL);
-
-	/* The wholedisk property was initialized to -1 in vdev_alloc() if it
-	 * was unspecified. In that case, check if this is a whole device.
-	 * When bdev->bd_contains == bdev we have a whole device and not simply
-	 * a partition. */
-	if (v->vdev_wholedisk == -1ULL)
-		v->vdev_wholedisk = (bdev->bd_contains == bdev);
+
+skip_open:
+	/* Determine the physical block size */
+	block_size = vdev_bdev_block_size(vd->vd_bdev);
 
 	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
 	v->vdev_nowritecache = B_FALSE;
 
 	/* Physical volume size in bytes */
-	*psize = bdev_capacity(bdev) * block_size;
+	*psize = bdev_capacity(vd->vd_bdev);
+
+	/* TODO: report possible expansion size */
+	*max_psize = *psize;
 
 	/* Based on the minimum sector size set the block size */
 	*ashift = highbit(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
 
+	/* Try to set the io scheduler elevator algorithm */
+	(void) vdev_elevator_switch(v, zfs_vdev_scheduler);
+
 	return 0;
 }
 
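In the ashift calculation above, highbit() returns the 1-based index of the highest set bit, so for a power-of-two block size highbit(x) - 1 is exactly log2(x): a 512-byte sector device yields ashift 9 and a 4096-byte device yields ashift 12. A self-contained sketch, where highbit() is a stand-in for the ZFS helper of the same name:

    #include <stdio.h>

    /* 1-based index of the highest set bit, 0 if no bits are set */
    static int
    highbit(unsigned long long v)
    {
        int bit;

        for (bit = 0; v != 0; bit++)
            v >>= 1;
        return bit;
    }

    int
    main(void)
    {
        unsigned long long block_size[] = { 512, 4096 };
        int i;

        /* MAX(block_size, SPA_MINBLOCKSIZE) with SPA_MINBLOCKSIZE = 512 */
        for (i = 0; i < 2; i++)
            printf("block size %-4llu -> ashift %d\n", block_size[i],
                highbit(block_size[i] > 512 ? block_size[i] : 512) - 1);
        return 0;   /* prints ashift 9 and ashift 12 */
    }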
@@ -175,7 +319,7 @@ vdev_disk_close(vdev_t *v)
 {
 	vdev_disk_t *vd = v->vdev_tsd;
 
-	if (vd == NULL)
+	if (v->vdev_reopening || vd == NULL)
 		return;
 
 	if (vd->vd_bdev != NULL)
@@ -193,7 +337,7 @@ vdev_disk_dio_alloc(int bio_count)
 	int i;
 
 	dr = kmem_zalloc(sizeof(dio_request_t) +
-	    sizeof(struct bio *) * bio_count, KM_SLEEP);
+	    sizeof(struct bio *) * bio_count, KM_PUSHPAGE);
 	if (dr) {
 		init_completion(&dr->dr_comp);
 		atomic_set(&dr->dr_ref, 0);
@@ -220,6 +364,27 @@ vdev_disk_dio_free(dio_request_t *dr)
 		sizeof(struct bio *) * dr->dr_bio_count);
 }
 
+static int
+vdev_disk_dio_is_sync(dio_request_t *dr)
+{
+#ifdef HAVE_BIO_RW_SYNC
+	/* BIO_RW_SYNC preferred interface from 2.6.12-2.6.29 */
+	return (dr->dr_rw & (1 << BIO_RW_SYNC));
+#else
+# ifdef HAVE_BIO_RW_SYNCIO
+	/* BIO_RW_SYNCIO preferred interface from 2.6.30-2.6.35 */
+	return (dr->dr_rw & (1 << BIO_RW_SYNCIO));
+# else
+#  ifdef HAVE_REQ_SYNC
+	/* REQ_SYNC preferred interface from 2.6.36-2.6.xx */
+	return (dr->dr_rw & REQ_SYNC);
+#  else
+#   error "Unable to determine bio sync flag"
+#  endif /* HAVE_REQ_SYNC */
+# endif /* HAVE_BIO_RW_SYNCIO */
+#endif /* HAVE_BIO_RW_SYNC */
+}
+
 static void
 vdev_disk_dio_get(dio_request_t *dr)
 {
@@ -242,8 +407,7 @@ vdev_disk_dio_put(dio_request_t *dr)
 		vdev_disk_dio_free(dr);
 
 		if (zio) {
-			zio->io_delay = jiffies_to_msecs(
-			    jiffies_64 - zio->io_delay);
+			zio->io_delay = jiffies_64 - zio->io_delay;
 			zio->io_error = error;
 			ASSERT3S(zio->io_error, >=, 0);
 			if (zio->io_error)
@@ -284,7 +448,7 @@ BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, size, error)
 	rc = vdev_disk_dio_put(dr);
 
-	/* Wake up synchronous waiter this is the last outstanding bio */
-	if ((rc == 1) && (dr->dr_rw & (1 << DIO_RW_SYNCIO)))
+	/* Wake up the synchronous waiter if this is the last outstanding bio */
+	if ((rc == 1) && vdev_disk_dio_is_sync(dr))
 		complete(&dr->dr_comp);
 
 	BIO_END_IO_RETURN(0);
@@ -337,7 +501,9 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr,
 	caddr_t bio_ptr;
 	uint64_t bio_offset;
 	int bio_size, bio_count = 16;
-	int i = 0, error = 0, block_size;
+	int i = 0, error = 0;
+
+	ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size);
 
 retry:
 	dr = vdev_disk_dio_alloc(bio_count);
@@ -349,7 +515,6 @@ retry:
 
 	dr->dr_zio = zio;
 	dr->dr_rw = flags;
-	block_size = vdev_bdev_block_size(bdev);
 
 	/*
 	 * When the IO size exceeds the maximum bio size for the request
@@ -375,7 +540,6 @@ retry:
 		if (dr->dr_bio_count == i) {
 			vdev_disk_dio_free(dr);
 			bio_count *= 2;
-			printk("WARNING: Resized bio's/dio to %d\n",bio_count);
 			goto retry;
 		}
 
@@ -390,7 +554,7 @@ retry:
 		vdev_disk_dio_get(dr);
 
 		dr->dr_bio[i]->bi_bdev = bdev;
-		dr->dr_bio[i]->bi_sector = bio_offset / block_size;
+		dr->dr_bio[i]->bi_sector = bio_offset >> 9;
 		dr->dr_bio[i]->bi_rw = dr->dr_rw;
 		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
 		dr->dr_bio[i]->bi_private = dr;
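The retry: path above (where the diagnostic printk is dropped) is a grow-and-retry allocation: __vdev_disk_physio() starts with a guess of 16 bios and, whenever a request spans more, throws the partially built request away and rebuilds it with twice the slots. A minimal sketch of the pattern, with hypothetical names:

    #include <stdlib.h>

    /* Hypothetical stand-in for the dio request bookkeeping */
    typedef struct dio_sketch {
        int ds_bio_count;   /* bio slots available */
    } dio_sketch_t;

    static dio_sketch_t *
    dio_alloc_for(int bios_needed)
    {
        int bio_count = 16; /* initial guess, as in the driver */
        dio_sketch_t *dr;

    retry:
        dr = calloc(1, sizeof (*dr));
        if (dr == NULL)
            return NULL;
        dr->ds_bio_count = bio_count;

        /* Guessed too small: discard and retry with double the slots */
        if (bios_needed > dr->ds_bio_count) {
            free(dr);
            bio_count *= 2;
            goto retry;
        }
        return dr;
    }

    int
    main(void)
    {
        dio_sketch_t *dr = dio_alloc_for(40);   /* ends up with 64 slots */

        free(dr);
        return 0;
    }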
@@ -421,7 +585,7 @@ retry:
-	 * only synchronous consumer is vdev_disk_read_rootlabel() all other
+	 * only synchronous consumer is vdev_disk_read_rootlabel(); all other
 	 * IO originating from vdev_disk_io_start() is asynchronous.
 	 */
-	if (dr->dr_rw & (1 << DIO_RW_SYNCIO)) {
+	if (vdev_disk_dio_is_sync(dr)) {
 		wait_for_completion(&dr->dr_comp);
 		error = dr->dr_error;
 		ASSERT3S(atomic_read(&dr->dr_ref), ==, 1);
@@ -440,13 +604,11 @@ vdev_disk_physio(struct block_device *bdev, caddr_t kbuf,
 	return __vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags);
 }
 
-/* 2.6.24 API change */
-#ifdef HAVE_BIO_EMPTY_BARRIER
 BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, size, rc)
 {
 	zio_t *zio = bio->bi_private;
 
-	zio->io_delay = jiffies_to_msecs(jiffies_64 - zio->io_delay);
+	zio->io_delay = jiffies_64 - zio->io_delay;
 	zio->io_error = -rc;
 	if (rc && (rc == -EOPNOTSUPP))
 		zio->io_vd->vdev_nowritecache = B_TRUE;
@@ -478,17 +640,10 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
 	bio->bi_private = zio;
 	bio->bi_bdev = bdev;
 	zio->io_delay = jiffies_64;
-	submit_bio(WRITE_BARRIER, bio);
+	submit_bio(VDEV_WRITE_FLUSH_FUA, bio);
 
 	return 0;
 }
-#else
-static int
-vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
-{
-	return ENOTSUP;
-}
-#endif /* HAVE_BIO_EMPTY_BARRIER */
 
 static int
 vdev_disk_io_start(zio_t *zio)
@@ -629,18 +784,18 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
 	uint64_t s, size;
 	int i;
 
-	bdev = vdev_bdev_open(devpath, vdev_bdev_mode(FREAD), NULL);
+	bdev = vdev_bdev_open(devpath, vdev_bdev_mode(FREAD), zfs_vdev_holder);
 	if (IS_ERR(bdev))
 		return -PTR_ERR(bdev);
 
-	s = bdev_capacity(bdev) * vdev_bdev_block_size(bdev);
+	s = bdev_capacity(bdev);
 	if (s == 0) {
 		vdev_bdev_close(bdev, vdev_bdev_mode(FREAD));
 		return EIO;
 	}
 
 	size = P2ALIGN_TYPED(s, sizeof(vdev_label_t), uint64_t);
-	label = vmem_alloc(sizeof(vdev_label_t), KM_SLEEP);
+	label = vmem_alloc(sizeof(vdev_label_t), KM_PUSHPAGE);
 
 	for (i = 0; i < VDEV_LABELS; i++) {
 		uint64_t offset, state, txg = 0;
@@ -679,3 +834,6 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
 
 	return 0;
 }
+
+module_param(zfs_vdev_scheduler, charp, 0644);
+MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
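A worked example of the P2ALIGN_TYPED() rounding used by vdev_disk_read_rootlabel(): the byte capacity is rounded down to a multiple of sizeof (vdev_label_t), which is 256 KiB, before the label offsets are derived from it. The macro and constants below are simplified stand-ins:

    #include <stdint.h>
    #include <stdio.h>

    #define VDEV_LABEL_SIZE (256ULL * 1024) /* sizeof (vdev_label_t) */
    /* Round x down to a multiple of align; align must be a power of two */
    #define P2ALIGN(x, align)   ((x) & -(align))

    int
    main(void)
    {
        uint64_t s = 21474836480ULL + 12345;    /* ~20 GiB, unaligned */
        uint64_t size = P2ALIGN(s, VDEV_LABEL_SIZE);

        printf("capacity %llu -> label-aligned %llu\n",
            (unsigned long long)s, (unsigned long long)size);
        return 0;
    }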