X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fzvol.c;h=c6cfac26d4afbb2e42576f87f77023abc32e8485;hb=36f86f73f68548f46eb3229c8adf583d59fa9988;hp=9dd9547e24b96f05261fefa5827a03acbf6ccfee;hpb=ce90208cf9e04df966429f115d8831371ea9afce;p=zfs.git

diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 9dd9547..c6cfac2 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -46,8 +46,10 @@
 #include <sys/zvol.h>
 #include <linux/blkdev_compat.h>
 
+unsigned int zvol_inhibit_dev = 0;
 unsigned int zvol_major = ZVOL_MAJOR;
 unsigned int zvol_threads = 32;
+unsigned long zvol_max_discard_blocks = 16384;
 
 static taskq_t *zvol_taskq;
 static kmutex_t zvol_state_lock;
@@ -139,6 +141,29 @@ zvol_find_by_name(const char *name)
 	return NULL;
 }
 
+
+/*
+ * Given a path, return TRUE if path is a ZVOL.
+ */
+boolean_t
+zvol_is_zvol(const char *device)
+{
+	struct block_device *bdev;
+	unsigned int major;
+
+	bdev = lookup_bdev(device);
+	if (IS_ERR(bdev))
+		return (B_FALSE);
+
+	major = MAJOR(bdev->bd_dev);
+	bdput(bdev);
+
+	if (major == zvol_major)
+            return (B_TRUE);
+
+	return (B_FALSE);
+}
+
 /*
  * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation.
  */
@@ -459,11 +484,15 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx,
 	uint32_t blocksize = zv->zv_volblocksize;
 	zilog_t *zilog = zv->zv_zilog;
 	boolean_t slogging;
+	ssize_t immediate_write_sz;
 
 	if (zil_replaying(zilog, tx))
 		return;
 
-	slogging = spa_has_slogs(zilog->zl_spa);
+	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+		? 0 : zvol_immediate_write_sz;
+	slogging = spa_has_slogs(zilog->zl_spa) &&
+		(zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
 
 	while (size) {
 		itx_t *itx;
@@ -475,7 +504,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx,
 		 * Unlike zfs_log_write() we can be called with
 		 * up to DMU_MAX_ACCESS/2 (5MB) writes.
 		 */
-		if (blocksize > zvol_immediate_write_sz && !slogging &&
+		if (blocksize > immediate_write_sz && !slogging &&
 		    size >= blocksize && offset % blocksize == 0) {
 			write_state = WR_INDIRECT; /* uses dmu_sync */
 			len = blocksize;
@@ -534,6 +563,14 @@ zvol_write(void *arg)
 	dmu_tx_t *tx;
 	rl_t *rl;
 
+	/*
+	 * Annotate this call path with a flag that indicates that it is
+	 * unsafe to use KM_SLEEP during memory allocations due to the
+	 * potential for a deadlock.  KM_PUSHPAGE should be used instead.
+	 */
+	ASSERT(!(current->flags & PF_NOFS));
+	current->flags |= PF_NOFS;
+
 	if (req->cmd_flags & VDEV_REQ_FLUSH)
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
@@ -542,7 +579,7 @@ zvol_write(void *arg)
 	 */
 	if (size == 0) {
 		blk_end_request(req, 0, size);
-		return;
+		goto out;
 	}
 
 	rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER);
@@ -556,7 +593,7 @@ zvol_write(void *arg)
 		dmu_tx_abort(tx);
 		zfs_range_unlock(rl);
 		blk_end_request(req, -error, size);
-		return;
+		goto out;
 	}
 
 	error = dmu_write_req(zv->zv_objset, ZVOL_OBJ, req, tx);
@@ -572,6 +609,8 @@ zvol_write(void *arg)
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
 	blk_end_request(req, -error, size);
+out:
+	current->flags &= ~PF_NOFS;
 }
 
 #ifdef HAVE_BLK_QUEUE_DISCARD
@@ -581,24 +620,41 @@ zvol_discard(void *arg)
 	struct request *req = (struct request *)arg;
 	struct request_queue *q = req->q;
 	zvol_state_t *zv = q->queuedata;
-	uint64_t offset = blk_rq_pos(req) << 9;
-	uint64_t size = blk_rq_bytes(req);
+	uint64_t start = blk_rq_pos(req) << 9;
+	uint64_t end = start + blk_rq_bytes(req);
 	int error;
 	rl_t *rl;
 
-	if (offset + size > zv->zv_volsize) {
-		blk_end_request(req, -EIO, size);
-		return;
+	/*
+	 * Annotate this call path with a flag that indicates that it is
+	 * unsafe to use KM_SLEEP during memory allocations due to the
+	 * potential for a deadlock.  KM_PUSHPAGE should be used instead.
+	 */
+	ASSERT(!(current->flags & PF_NOFS));
+	current->flags |= PF_NOFS;
+
+	if (end > zv->zv_volsize) {
+		blk_end_request(req, -EIO, blk_rq_bytes(req));
+		goto out;
 	}
 
-	if (size == 0) {
-		blk_end_request(req, 0, size);
-		return;
+	/*
+	 * Align the request to volume block boundaries. If we don't,
+	 * then this will force dnode_free_range() to zero out the
+	 * unaligned parts, which is slow (read-modify-write) and
+	 * useless since we are not freeing any space by doing so.
+	 */
+	start = P2ROUNDUP(start, zv->zv_volblocksize);
+	end = P2ALIGN(end, zv->zv_volblocksize);
+
+	if (start >= end) {
+		blk_end_request(req, 0, blk_rq_bytes(req));
+		goto out;
 	}
 
-	rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER);
+	rl = zfs_range_lock(&zv->zv_znode, start, end - start, RL_WRITER);
 
-	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, size);
+	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, end - start);
 
 	/*
 	 * TODO: maybe we should add the operation to the log.
@@ -606,7 +662,9 @@ zvol_discard(void *arg)
 
 	zfs_range_unlock(rl);
 
-	blk_end_request(req, -error, size);
+	blk_end_request(req, -error, blk_rq_bytes(req));
+out:
+	current->flags &= ~PF_NOFS;
 }
 #endif /* HAVE_BLK_QUEUE_DISCARD */
 
@@ -759,7 +817,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
 	ASSERT(zio != NULL);
 	ASSERT(size != 0);
 
-	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_PUSHPAGE);
 	zgd->zgd_zilog = zv->zv_zilog;
 	zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
 
@@ -833,25 +891,49 @@ zvol_first_open(zvol_state_t *zv)
 {
 	objset_t *os;
 	uint64_t volsize;
+	int locked = 0;
 	int error;
 	uint64_t ro;
 
+	/*
+	 * In all other cases the spa_namespace_lock is taken before the
+	 * bdev->bd_mutex lock.  But in this case the Linux __blkdev_get()
+	 * function calls fops->open() with the bdev->bd_mutex lock held.
+	 *
+	 * To avoid a potential lock inversion deadlock we preemptively
+	 * try to take the spa_namespace_lock().  Normally it will not
+	 * be contended and this is safe because spa_open_common() handles
+	 * the case where the caller already holds the spa_namespace_lock.
+	 *
+	 * When it is contended we risk a lock inversion if we were to
+	 * block waiting for the lock.  Luckily, the __blkdev_get()
+	 * function allows us to return -ERESTARTSYS which will result in
+	 * bdev->bd_mutex being dropped, reacquired, and fops->open() being
+	 * called again.  This process can be repeated safely until both
+	 * locks are acquired.
+	 */
+	if (!mutex_owned(&spa_namespace_lock)) {
+		locked = mutex_tryenter(&spa_namespace_lock);
+		if (!locked)
+			return (-ERESTARTSYS);
+	}
+
 	/* lie and say we're read-only */
 	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zvol_tag, &os);
 	if (error)
-		return (-error);
+		goto out_mutex;
 
 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
 	if (error) {
-	        dmu_objset_disown(os, zvol_tag);
-	        return (-error);
+		dmu_objset_disown(os, zvol_tag);
+		goto out_mutex;
 	}
 
 	zv->zv_objset = os;
 	error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
 	if (error) {
-	        dmu_objset_disown(os, zvol_tag);
-	        return (-error);
+		dmu_objset_disown(os, zvol_tag);
+		goto out_mutex;
 	}
 
 	set_capacity(zv->zv_disk, volsize >> 9);
@@ -860,13 +942,17 @@ zvol_first_open(zvol_state_t *zv)
 
 	VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL) == 0);
 	if (ro || dmu_objset_is_snapshot(os)) {
-                set_disk_ro(zv->zv_disk, 1);
-	        zv->zv_flags |= ZVOL_RDONLY;
+		set_disk_ro(zv->zv_disk, 1);
+		zv->zv_flags |= ZVOL_RDONLY;
 	} else {
-                set_disk_ro(zv->zv_disk, 0);
-	        zv->zv_flags &= ~ZVOL_RDONLY;
+		set_disk_ro(zv->zv_disk, 0);
+		zv->zv_flags &= ~ZVOL_RDONLY;
 	}
 
+out_mutex:
+	if (locked)
+		mutex_exit(&spa_namespace_lock);
+
 	return (-error);
 }
 
@@ -875,8 +961,18 @@ zvol_last_close(zvol_state_t *zv)
 {
 	zil_close(zv->zv_zilog);
 	zv->zv_zilog = NULL;
+
 	dmu_buf_rele(zv->zv_dbuf, zvol_tag);
 	zv->zv_dbuf = NULL;
+
+	/*
+	 * Evict cached data
+	 */
+	if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
+	    !(zv->zv_flags & ZVOL_RDONLY))
+		txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
+	(void) dmu_objset_evict_dbufs(zv->zv_objset);
+
 	dmu_objset_disown(zv->zv_objset, zvol_tag);
 	zv->zv_objset = NULL;
 }
@@ -1039,7 +1135,7 @@ zvol_probe(dev_t dev, int *part, void *arg)
 
 	mutex_enter(&zvol_state_lock);
 	zv = zvol_find_by_dev(dev);
-	kobj = zv ? get_disk(zv->zv_disk) : ERR_PTR(-ENOENT);
+	kobj = zv ? get_disk(zv->zv_disk) : NULL;
 	mutex_exit(&zvol_state_lock);
 
 	return kobj;
@@ -1114,6 +1210,7 @@ static zvol_state_t *
 zvol_alloc(dev_t dev, const char *name)
 {
 	zvol_state_t *zv;
+	int error = 0;
 
 	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
 	if (zv == NULL)
@@ -1123,6 +1220,15 @@ zvol_alloc(dev_t dev, const char *name)
 	if (zv->zv_queue == NULL)
 		goto out_kmem;
 
+#ifdef HAVE_ELEVATOR_CHANGE
+	error = elevator_change(zv->zv_queue, "noop");
+#endif /* HAVE_ELEVATOR_CHANGE */
+	if (error) {
+		printk("ZFS: Unable to set \"%s\" scheduler for zvol %s: %d\n",
+		    "noop", name, error);
+		goto out_queue;
+	}
+
 #ifdef HAVE_BLK_QUEUE_FLUSH
 	blk_queue_flush(zv->zv_queue, VDEV_REQ_FLUSH | VDEV_REQ_FUA);
 #else
@@ -1237,7 +1343,9 @@ __zvol_create_minor(const char *name)
 	blk_queue_physical_block_size(zv->zv_queue, zv->zv_volblocksize);
 	blk_queue_io_opt(zv->zv_queue, zv->zv_volblocksize);
 #ifdef HAVE_BLK_QUEUE_DISCARD
-	blk_queue_max_discard_sectors(zv->zv_queue, UINT_MAX);
+	blk_queue_max_discard_sectors(zv->zv_queue,
+	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
+	blk_queue_discard_granularity(zv->zv_queue, zv->zv_volblocksize);
 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zv->zv_queue);
 #endif
 #ifdef HAVE_BLK_QUEUE_NONROT
@@ -1249,9 +1357,9 @@ __zvol_create_minor(const char *name)
 	else
 		zil_replay(os, zv, zvol_replay_vector);
 
+	zv->zv_objset = NULL;
 out_dmu_objset_disown:
 	dmu_objset_disown(os, zvol_tag);
-	zv->zv_objset = NULL;
 out_doi:
 	kmem_free(doi, sizeof(dmu_object_info_t));
 out:
@@ -1337,6 +1445,9 @@ zvol_create_minors(const char *pool)
 	spa_t *spa = NULL;
 	int error = 0;
 
+	if (zvol_inhibit_dev)
+		return (0);
+
 	mutex_enter(&zvol_state_lock);
 	if (pool) {
 		error = dmu_objset_find_spa(NULL, pool, zvol_create_minors_cb,
@@ -1366,6 +1477,9 @@ zvol_remove_minors(const char *pool)
 	zvol_state_t *zv, *zv_next;
 	char *str;
 
+	if (zvol_inhibit_dev)
+		return;
+
 	str = kmem_zalloc(MAXNAMELEN, KM_SLEEP);
 	if (pool) {
 		(void) strncpy(str, pool, strlen(pool));
@@ -1390,14 +1504,8 @@ zvol_init(void)
 {
 	int error;
 
-	/*
-	 * The zvol taskqs are created with TASKQ_NORECLAIM so they may be
-	 * used safely as a swap device.  If direct reclaim is allowed then
-	 * they quickly deadlock in one of the internal memory allocations.
-	 */
 	zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_threads, maxclsyspri,
-				  zvol_threads, INT_MAX,
-				  TASKQ_PREPOPULATE | TASKQ_NORECLAIM);
+		                  zvol_threads, INT_MAX, TASKQ_PREPOPULATE);
 	if (zvol_taskq == NULL) {
 		printk(KERN_INFO "ZFS: taskq_create() failed\n");
 		return (-ENOMEM);
@@ -1433,8 +1541,14 @@ zvol_fini(void)
 	list_destroy(&zvol_state_list);
 }
 
+module_param(zvol_inhibit_dev, uint, 0644);
+MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
+
 module_param(zvol_major, uint, 0444);
 MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
 
 module_param(zvol_threads, uint, 0444);
 MODULE_PARM_DESC(zvol_threads, "Number of threads for zvol device");
+
+module_param(zvol_max_discard_blocks, ulong, 0444);
+MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard at once");