Clean up zvol initialization code
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 6e92942..97b65c8 100644
 #include <sys/zfs_rlock.h>
 #include <sys/zfs_znode.h>
 #include <sys/zvol.h>
+#include <linux/blkdev_compat.h>
 
+unsigned int zvol_inhibit_dev = 0;
 unsigned int zvol_major = ZVOL_MAJOR;
-unsigned int zvol_threads = 0;
+unsigned int zvol_threads = 32;
+unsigned long zvol_max_discard_blocks = 16384;
 
 static taskq_t *zvol_taskq;
 static kmutex_t zvol_state_lock;
@@ -57,7 +60,7 @@ static char *zvol_tag = "zvol_tag";
  * The in-core state of each volume.
  */
 typedef struct zvol_state {
-       char                    zv_name[DISK_NAME_LEN]; /* name */
+       char                    zv_name[MAXNAMELEN];    /* name */
        uint64_t                zv_volsize;     /* advertised space */
        uint64_t                zv_volblocksize;/* volume block size */
        objset_t                *zv_objset;     /* objset handle */
@@ -131,13 +134,36 @@ zvol_find_by_name(const char *name)
        ASSERT(MUTEX_HELD(&zvol_state_lock));
        for (zv = list_head(&zvol_state_list); zv != NULL;
             zv = list_next(&zvol_state_list, zv)) {
-               if (!strncmp(zv->zv_name, name, DISK_NAME_LEN))
+               if (!strncmp(zv->zv_name, name, MAXNAMELEN))
                        return zv;
        }
 
        return NULL;
 }
 
+
+/*
+ * Given a path, return TRUE if path is a ZVOL.
+ */
+boolean_t
+zvol_is_zvol(const char *device)
+{
+       struct block_device *bdev;
+       unsigned int major;
+
+       bdev = lookup_bdev(device);
+       if (IS_ERR(bdev))
+               return (B_FALSE);
+
+       major = MAJOR(bdev->bd_dev);
+       bdput(bdev);
+
+       if (major == zvol_major)
+       return (B_TRUE);
+
+       return (B_FALSE);
+}
+
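For context, the test above simply compares the device's major number against the major zvol registered. A userspace analogue (hypothetical, not part of this change; the kernel must resolve the path with lookup_bdev()/bdput() rather than stat(2)) would be:

#include <sys/stat.h>
#include <sys/sysmacros.h>	/* major() */

/* Hypothetical sketch: is the block device at 'device' a zvol? */
static int
is_zvol_like(const char *device, unsigned int zvol_major)
{
	struct stat st;

	if (stat(device, &st) != 0 || !S_ISBLK(st.st_mode))
		return (0);

	return (major(st.st_rdev) == zvol_major);
}
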
 /*
  * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation.
  */
@@ -227,7 +253,7 @@ zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
  * Ensure the zap is flushed then inform the VFS of the capacity change.
  */
 static int
-zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
+zvol_update_volsize(zvol_state_t *zv, uint64_t volsize, objset_t *os)
 {
        struct block_device *bdev;
        dmu_tx_t *tx;
@@ -235,7 +261,7 @@ zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
 
        ASSERT(MUTEX_HELD(&zvol_state_lock));
 
-       tx = dmu_tx_create(zv->zv_objset);
+       tx = dmu_tx_create(os);
        dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
        error = dmu_tx_assign(tx, TXG_WAIT);
        if (error) {
@@ -243,27 +269,35 @@ zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
                return (error);
        }
 
-       error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
+       error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
            &volsize, tx);
        dmu_tx_commit(tx);
 
        if (error)
                return (error);
 
-       error = dmu_free_long_range(zv->zv_objset,
+       error = dmu_free_long_range(os,
            ZVOL_OBJ, volsize, DMU_OBJECT_END);
        if (error)
                return (error);
 
-       zv->zv_volsize = volsize;
-       zv->zv_changed = 1;
-
        bdev = bdget_disk(zv->zv_disk, 0);
        if (!bdev)
-               return EIO;
+               return (EIO);
+/*
+ * 2.6.28 API change
+ * Added check_disk_size_change() helper function.
+ */
+#ifdef HAVE_CHECK_DISK_SIZE_CHANGE
+       set_capacity(zv->zv_disk, volsize >> 9);
+       zv->zv_volsize = volsize;
+       check_disk_size_change(zv->zv_disk, bdev);
+#else
+       zv->zv_volsize = volsize;
+       zv->zv_changed = 1;
+       (void) check_disk_change(bdev);
+#endif /* HAVE_CHECK_DISK_SIZE_CHANGE */
 
-       error = check_disk_change(bdev);
-       ASSERT3U(error, !=, 0);
        bdput(bdev);
 
        return (0);
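
Note the "volsize >> 9" above: set_capacity() takes a count of 512-byte sectors regardless of zv_volblocksize. A quick standalone check of the arithmetic (illustrative values):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t volsize = 8ULL << 30;	/* an 8 GiB zvol */

	/* 2^33 bytes / 2^9 bytes per sector = 16777216 sectors */
	printf("%llu sectors\n", (unsigned long long)(volsize >> 9));
	return (0);
}
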
@@ -310,7 +344,7 @@ zvol_set_volsize(const char *name, uint64_t volsize)
                goto out_doi;
        }
 
-       error = zvol_update_volsize(zv, volsize);
+       error = zvol_update_volsize(zv, volsize, os);
 out_doi:
        kmem_free(doi, sizeof(dmu_object_info_t));
 out:
@@ -419,20 +453,20 @@ zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
  * Callback vectors for replaying records.
  * Only TX_WRITE is needed for zvol.
  */
-zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
-       (zil_replay_func_t *)zvol_replay_err,   /* no such transaction type */
-       (zil_replay_func_t *)zvol_replay_err,   /* TX_CREATE */
-       (zil_replay_func_t *)zvol_replay_err,   /* TX_MKDIR */
-       (zil_replay_func_t *)zvol_replay_err,   /* TX_MKXATTR */
-       (zil_replay_func_t *)zvol_replay_err,   /* TX_SYMLINK */
-       (zil_replay_func_t *)zvol_replay_err,   /* TX_REMOVE */
-       (zil_replay_func_t *)zvol_replay_err,   /* TX_RMDIR */
-       (zil_replay_func_t *)zvol_replay_err,   /* TX_LINK */
-       (zil_replay_func_t *)zvol_replay_err,   /* TX_RENAME */
-       (zil_replay_func_t *)zvol_replay_write, /* TX_WRITE */
-       (zil_replay_func_t *)zvol_replay_err,   /* TX_TRUNCATE */
-       (zil_replay_func_t *)zvol_replay_err,   /* TX_SETATTR */
-       (zil_replay_func_t *)zvol_replay_err,   /* TX_ACL */
+zil_replay_func_t zvol_replay_vector[TX_MAX_TYPE] = {
+       (zil_replay_func_t)zvol_replay_err,     /* no such transaction type */
+       (zil_replay_func_t)zvol_replay_err,     /* TX_CREATE */
+       (zil_replay_func_t)zvol_replay_err,     /* TX_MKDIR */
+       (zil_replay_func_t)zvol_replay_err,     /* TX_MKXATTR */
+       (zil_replay_func_t)zvol_replay_err,     /* TX_SYMLINK */
+       (zil_replay_func_t)zvol_replay_err,     /* TX_REMOVE */
+       (zil_replay_func_t)zvol_replay_err,     /* TX_RMDIR */
+       (zil_replay_func_t)zvol_replay_err,     /* TX_LINK */
+       (zil_replay_func_t)zvol_replay_err,     /* TX_RENAME */
+       (zil_replay_func_t)zvol_replay_write,   /* TX_WRITE */
+       (zil_replay_func_t)zvol_replay_err,     /* TX_TRUNCATE */
+       (zil_replay_func_t)zvol_replay_err,     /* TX_SETATTR */
+       (zil_replay_func_t)zvol_replay_err,     /* TX_ACL */
 };
 
 /*
@@ -450,11 +484,15 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx,
        uint32_t blocksize = zv->zv_volblocksize;
        zilog_t *zilog = zv->zv_zilog;
        boolean_t slogging;
+       ssize_t immediate_write_sz;
 
        if (zil_replaying(zilog, tx))
                return;
 
-       slogging = spa_has_slogs(zilog->zl_spa);
+       immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+               ? 0 : zvol_immediate_write_sz;
+       slogging = spa_has_slogs(zilog->zl_spa) &&
+               (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
 
        while (size) {
                itx_t *itx;
@@ -466,7 +504,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx,
                 * Unlike zfs_log_write() we can be called with
                 * up to DMU_MAX_ACCESS/2 (5MB) writes.
                 */
-               if (blocksize > zvol_immediate_write_sz && !slogging &&
+               if (blocksize > immediate_write_sz && !slogging &&
                    size >= blocksize && offset % blocksize == 0) {
                        write_state = WR_INDIRECT; /* uses dmu_sync */
                        len = blocksize;
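
Putting the two new conditions together: logbias=throughput zeroes the immediate-write threshold and ignores any slog for this decision, so a block-aligned write of at least one full block goes out via dmu_sync() (WR_INDIRECT) rather than being copied into the log record. A simplified, non-authoritative restatement of that decision (standalone C; the function name is hypothetical):

#include <stdint.h>

enum wr_state { WR_INDIRECT, WR_COPIED_OR_NEED_COPY };

static enum wr_state
pick_write_state(int throughput_bias, int has_slog, uint64_t blocksize,
    uint64_t size, uint64_t offset, uint64_t immediate_write_sz)
{
	uint64_t threshold = throughput_bias ? 0 : immediate_write_sz;
	int slogging = has_slog && !throughput_bias;

	if (blocksize > threshold && !slogging &&
	    size >= blocksize && offset % blocksize == 0)
		return (WR_INDIRECT);		/* written via dmu_sync() */

	return (WR_COPIED_OR_NEED_COPY);	/* data carried in the itx */
}
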
@@ -525,6 +563,25 @@ zvol_write(void *arg)
        dmu_tx_t *tx;
        rl_t *rl;
 
+       /*
+        * Annotate this call path with a flag that indicates that it is
+        * unsafe to use KM_SLEEP during memory allocations due to the
+        * potential for a deadlock.  KM_PUSHPAGE should be used instead.
+        */
+       ASSERT(!(current->flags & PF_NOFS));
+       current->flags |= PF_NOFS;
+
+       if (req->cmd_flags & VDEV_REQ_FLUSH)
+               zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+       /*
+        * Some requests are just for flush and nothing else.
+        */
+       if (size == 0) {
+               blk_end_request(req, 0, size);
+               goto out;
+       }
+
        rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER);
 
        tx = dmu_tx_create(zv->zv_objset);
@@ -536,22 +593,81 @@ zvol_write(void *arg)
                dmu_tx_abort(tx);
                zfs_range_unlock(rl);
                blk_end_request(req, -error, size);
-               return;
+               goto out;
        }
 
        error = dmu_write_req(zv->zv_objset, ZVOL_OBJ, req, tx);
        if (error == 0)
-               zvol_log_write(zv, tx, offset, size, rq_is_sync(req));
+               zvol_log_write(zv, tx, offset, size,
+                   req->cmd_flags & VDEV_REQ_FUA);
 
        dmu_tx_commit(tx);
        zfs_range_unlock(rl);
 
-       if (rq_is_sync(req))
+       if ((req->cmd_flags & VDEV_REQ_FUA) ||
+           zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)
                zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
        blk_end_request(req, -error, size);
+out:
+       current->flags &= ~PF_NOFS;
 }
 
+#ifdef HAVE_BLK_QUEUE_DISCARD
+static void
+zvol_discard(void *arg)
+{
+       struct request *req = (struct request *)arg;
+       struct request_queue *q = req->q;
+       zvol_state_t *zv = q->queuedata;
+       uint64_t start = blk_rq_pos(req) << 9;
+       uint64_t end = start + blk_rq_bytes(req);
+       int error;
+       rl_t *rl;
+
+       /*
+        * Annotate this call path with a flag that indicates that it is
+        * unsafe to use KM_SLEEP during memory allocations due to the
+        * potential for a deadlock.  KM_PUSHPAGE should be used instead.
+        */
+       ASSERT(!(current->flags & PF_NOFS));
+       current->flags |= PF_NOFS;
+
+       if (end > zv->zv_volsize) {
+               blk_end_request(req, -EIO, blk_rq_bytes(req));
+               goto out;
+       }
+
+       /*
+        * Align the request to volume block boundaries. If we don't,
+        * then this will force dnode_free_range() to zero out the
+        * unaligned parts, which is slow (read-modify-write) and
+        * useless since we are not freeing any space by doing so.
+        */
+       start = P2ROUNDUP(start, zv->zv_volblocksize);
+       end = P2ALIGN(end, zv->zv_volblocksize);
+
+       if (start >= end) {
+               blk_end_request(req, 0, blk_rq_bytes(req));
+               goto out;
+       }
+
+       rl = zfs_range_lock(&zv->zv_znode, start, end - start, RL_WRITER);
+
+       error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, end - start);
+
+       /*
+        * TODO: maybe we should add the operation to the log.
+        */
+
+       zfs_range_unlock(rl);
+
+       blk_end_request(req, -error, blk_rq_bytes(req));
+out:
+       current->flags &= ~PF_NOFS;
+}
+#endif /* HAVE_BLK_QUEUE_DISCARD */
+
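The alignment trimming above is easiest to see with concrete numbers. Using the stock Solaris-style power-of-two macros (reproduced here so the sketch stands alone; the byte values are illustrative):

#include <stdio.h>
#include <stdint.h>

#define	P2ALIGN(x, align)	((x) & -(align))	/* round down */
#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))	/* round up */

int
main(void)
{
	uint64_t blocksize = 8192;		/* zv_volblocksize */
	uint64_t start = 5000, end = 20000;	/* requested discard range */

	start = P2ROUNDUP(start, blocksize);	/* 8192 */
	end = P2ALIGN(end, blocksize);		/* 16384 */

	/*
	 * Only the fully covered block [8192, 16384) is freed; the
	 * ragged head and tail would cost a read-modify-write in
	 * dnode_free_range() while freeing no space at all.
	 */
	printf("freeing [%llu, %llu)\n",
	    (unsigned long long)start, (unsigned long long)end);
	return (0);
}
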
 /*
  * Common read path running under the zvol taskq context.  This function
  * is responsible for copying the requested data out of the DMU and in to
@@ -569,6 +685,11 @@ zvol_read(void *arg)
        int error;
        rl_t *rl;
 
+       if (size == 0) {
+               blk_end_request(req, 0, size);
+               return;
+       }
+
        rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
 
        error = dmu_read_req(zv->zv_objset, ZVOL_OBJ, req);
@@ -618,7 +739,7 @@ zvol_request(struct request_queue *q)
        while ((req = blk_fetch_request(q)) != NULL) {
                size = blk_rq_bytes(req);
 
-               if (blk_rq_pos(req) + blk_rq_sectors(req) >
+               if (size != 0 && blk_rq_pos(req) + blk_rq_sectors(req) >
                    get_capacity(zv->zv_disk)) {
                        printk(KERN_INFO
                               "%s: bad access: block=%llu, count=%lu\n",
@@ -647,6 +768,13 @@ zvol_request(struct request_queue *q)
                                break;
                        }
 
+#ifdef HAVE_BLK_QUEUE_DISCARD
+                       if (req->cmd_flags & VDEV_REQ_DISCARD) {
+                               zvol_dispatch(zvol_discard, req);
+                               break;
+                       }
+#endif /* HAVE_BLK_QUEUE_DISCARD */
+
                        zvol_dispatch(zvol_write, req);
                        break;
                default:
@@ -689,7 +817,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
        ASSERT(zio != NULL);
        ASSERT(size != 0);
 
-       zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+       zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_PUSHPAGE);
        zgd->zgd_zilog = zv->zv_zilog;
        zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
 
@@ -763,25 +891,49 @@ zvol_first_open(zvol_state_t *zv)
 {
        objset_t *os;
        uint64_t volsize;
+       int locked = 0;
        int error;
        uint64_t ro;
 
+       /*
+        * In all other cases the spa_namespace_lock is taken before the
+        * bdev->bd_mutex lock.  But in this case the Linux __blkdev_get()
+        * function calls fops->open() with the bdev->bd_mutex lock held.
+        *
+        * To avoid a potential lock inversion deadlock we preemptively
+        * try to take the spa_namespace_lock().  Normally it will not
+        * be contended and this is safe because spa_open_common() handles
+        * the case where the caller already holds the spa_namespace_lock.
+        *
+        * When it is contended we risk a lock inversion if we were to
+        * block waiting for the lock.  Luckily, the __blkdev_get()
+        * function allows us to return -ERESTARTSYS which will result in
+        * bdev->bd_mutex being dropped, reacquired, and fops->open() being
+        * called again.  This process can be repeated safely until both
+        * locks are acquired.
+        */
+       if (!mutex_owned(&spa_namespace_lock)) {
+               locked = mutex_tryenter(&spa_namespace_lock);
+               if (!locked)
+                       return (-ERESTARTSYS);
+       }
+
        /* lie and say we're read-only */
        error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zvol_tag, &os);
        if (error)
-               return (-error);
+               goto out_mutex;
 
        error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
        if (error) {
-               dmu_objset_disown(os, zvol_tag);
-               return (-error);
+               dmu_objset_disown(os, zvol_tag);
+               goto out_mutex;
        }
 
        zv->zv_objset = os;
        error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
        if (error) {
-               dmu_objset_disown(os, zvol_tag);
-               return (-error);
+               dmu_objset_disown(os, zvol_tag);
+               goto out_mutex;
        }
 
        set_capacity(zv->zv_disk, volsize >> 9);
@@ -789,14 +941,19 @@ zvol_first_open(zvol_state_t *zv)
        zv->zv_zilog = zil_open(os, zvol_get_data);
 
        VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL) == 0);
-       if (ro || dmu_objset_is_snapshot(os)) {
-                set_disk_ro(zv->zv_disk, 1);
-               zv->zv_flags |= ZVOL_RDONLY;
+       if (ro || dmu_objset_is_snapshot(os) ||
+           !spa_writeable(dmu_objset_spa(os))) {
+               set_disk_ro(zv->zv_disk, 1);
+               zv->zv_flags |= ZVOL_RDONLY;
        } else {
-                set_disk_ro(zv->zv_disk, 0);
-               zv->zv_flags &= ~ZVOL_RDONLY;
+               set_disk_ro(zv->zv_disk, 0);
+               zv->zv_flags &= ~ZVOL_RDONLY;
        }
 
+out_mutex:
+       if (locked)
+               mutex_exit(&spa_namespace_lock);
+
        return (-error);
 }
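
The retry contract described in the comment can be distilled into a userspace analogue (hypothetical pthread sketch; ERESTARTSYS is kernel-internal, so it is defined locally here):

#include <pthread.h>

#define	ERESTARTSYS	512	/* kernel-internal value, for the sketch only */

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER; /* spa_namespace_lock */
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER; /* bdev->bd_mutex */

/*
 * Called with lock_b already held, while every other path takes
 * lock_a before lock_b.  Blocking on lock_a here could deadlock,
 * so on contention we back out and let the caller drop lock_b,
 * reacquire it, and call again.
 */
static int
first_open_sketch(void)
{
	if (pthread_mutex_trylock(&lock_a) != 0)
		return (-ERESTARTSYS);

	/* ... both locks held: safe to open the objset ... */

	pthread_mutex_unlock(&lock_a);
	return (0);
}
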
 
@@ -805,8 +962,18 @@ zvol_last_close(zvol_state_t *zv)
 {
        zil_close(zv->zv_zilog);
        zv->zv_zilog = NULL;
+
        dmu_buf_rele(zv->zv_dbuf, zvol_tag);
        zv->zv_dbuf = NULL;
+
+       /*
+        * Evict cached data
+        */
+       if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
+           !(zv->zv_flags & ZVOL_RDONLY))
+               txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
+       (void) dmu_objset_evict_dbufs(zv->zv_objset);
+
        dmu_objset_disown(zv->zv_objset, zvol_tag);
        zv->zv_objset = NULL;
 }
@@ -894,6 +1061,9 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode,
        case BLKFLSBUF:
                zil_commit(zv->zv_zilog, ZVOL_OBJ);
                break;
+       case BLKZNAME:
+               error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
+               break;
 
        default:
                error = -ENOTTY;
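
BLKZNAME gives userspace a way to map a /dev/zdN node back to its dataset name. A minimal consumer might look like the sketch below; the ioctl request number shown is an assumption and should really come from the zvol headers, and the buffer must hold at least MAXNAMELEN (256) bytes, since that is how much the handler copies out:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#ifndef BLKZNAME
#define	BLKZNAME	_IOR(0x12, 125, char[256])	/* assumed definition */
#endif

int
main(int argc, char **argv)
{
	char name[256];
	int fd = open(argc > 1 ? argv[1] : "/dev/zd0", O_RDONLY);

	if (fd < 0 || ioctl(fd, BLKZNAME, name) != 0) {
		perror("BLKZNAME");
		return (1);
	}

	printf("%s\n", name);	/* e.g. "pool/volume" */
	close(fd);
	return (0);
}
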
@@ -966,7 +1136,7 @@ zvol_probe(dev_t dev, int *part, void *arg)
 
        mutex_enter(&zvol_state_lock);
        zv = zvol_find_by_dev(dev);
-       kobj = zv ? get_disk(zv->zv_disk) : ERR_PTR(-ENOENT);
+       kobj = zv ? get_disk(zv->zv_disk) : NULL;
        mutex_exit(&zvol_state_lock);
 
        return kobj;
@@ -1002,6 +1172,8 @@ static int
 zvol_ioctl_by_inode(struct inode *inode, struct file *file,
                     unsigned int cmd, unsigned long arg)
 {
+       if (file == NULL || inode == NULL)
+               return -EINVAL;
        return zvol_ioctl(inode->i_bdev, file->f_mode, cmd, arg);
 }
 
@@ -1010,6 +1182,8 @@ static long
 zvol_compat_ioctl_by_inode(struct file *file,
                            unsigned int cmd, unsigned long arg)
 {
+       if (file == NULL)
+               return -EINVAL;
        return zvol_compat_ioctl(file->f_dentry->d_inode->i_bdev,
                                 file->f_mode, cmd, arg);
 }
@@ -1037,15 +1211,32 @@ static zvol_state_t *
 zvol_alloc(dev_t dev, const char *name)
 {
        zvol_state_t *zv;
+       int error = 0;
 
        zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
-       if (zv == NULL)
-               goto out;
+
+       spin_lock_init(&zv->zv_lock);
+       list_link_init(&zv->zv_next);
 
        zv->zv_queue = blk_init_queue(zvol_request, &zv->zv_lock);
        if (zv->zv_queue == NULL)
                goto out_kmem;
 
+#ifdef HAVE_ELEVATOR_CHANGE
+       error = elevator_change(zv->zv_queue, "noop");
+#endif /* HAVE_ELEVATOR_CHANGE */
+       if (error) {
+               printk("ZFS: Unable to set \"%s\" scheduler for zvol %s: %d\n",
+                   "noop", name, error);
+               goto out_queue;
+       }
+
+#ifdef HAVE_BLK_QUEUE_FLUSH
+       blk_queue_flush(zv->zv_queue, VDEV_REQ_FLUSH | VDEV_REQ_FUA);
+#else
+       blk_queue_ordered(zv->zv_queue, QUEUE_ORDERED_DRAIN, NULL);
+#endif /* HAVE_BLK_QUEUE_FLUSH */
+
        zv->zv_disk = alloc_disk(ZVOL_MINORS);
        if (zv->zv_disk == NULL)
                goto out_queue;
@@ -1053,20 +1244,20 @@ zvol_alloc(dev_t dev, const char *name)
        zv->zv_queue->queuedata = zv;
        zv->zv_dev = dev;
        zv->zv_open_count = 0;
-       strlcpy(zv->zv_name, name, DISK_NAME_LEN);
+       strlcpy(zv->zv_name, name, MAXNAMELEN);
 
        mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
        avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
            sizeof (rl_t), offsetof(rl_t, r_node));
-       spin_lock_init(&zv->zv_lock);
-       list_link_init(&zv->zv_next);
+       zv->zv_znode.z_is_zvol = TRUE;
 
        zv->zv_disk->major = zvol_major;
        zv->zv_disk->first_minor = (dev & MINORMASK);
        zv->zv_disk->fops = &zvol_ops;
        zv->zv_disk->private_data = zv;
        zv->zv_disk->queue = zv->zv_queue;
-       snprintf(zv->zv_disk->disk_name, DISK_NAME_LEN, "%s", name);
+       snprintf(zv->zv_disk->disk_name, DISK_NAME_LEN, "%s%d",
+           ZVOL_DEV_NAME, (dev & MINORMASK));
 
        return zv;
 
@@ -1074,7 +1265,7 @@ out_queue:
        blk_cleanup_queue(zv->zv_queue);
 out_kmem:
        kmem_free(zv, sizeof (zvol_state_t));
-out:
+
        return NULL;
 }
 
@@ -1095,7 +1286,28 @@ zvol_free(zvol_state_t *zv)
 }
 
 static int
-__zvol_create_minor(const char *name)
+__zvol_snapdev_hidden(const char *name)
+{
+       uint64_t snapdev;
+       char *parent;
+       char *atp;
+       int error = 0;
+
+       parent = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+       (void) strlcpy(parent, name, MAXPATHLEN);
+
+       if ((atp = strrchr(parent, '@')) != NULL) {
+               *atp = '\0';
+               error = dsl_prop_get_integer(parent, "snapdev", &snapdev, NULL);
+               if ((error == 0) && (snapdev == ZFS_SNAPDEV_HIDDEN))
+                       error = ENODEV;
+       }
+       kmem_free(parent, MAXPATHLEN);
+       return (error);
+}
+
+static int
+__zvol_create_minor(const char *name, boolean_t ignore_snapdev)
 {
        zvol_state_t *zv;
        objset_t *os;
@@ -1112,6 +1324,12 @@ __zvol_create_minor(const char *name)
                goto out;
        }
 
+       if (ignore_snapdev == B_FALSE) {
+               error = __zvol_snapdev_hidden(name);
+               if (error)
+                       goto out;
+       }
+
        doi = kmem_alloc(sizeof(dmu_object_info_t), KM_SLEEP);
 
        error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os);
@@ -1145,14 +1363,31 @@ __zvol_create_minor(const char *name)
 
        set_capacity(zv->zv_disk, zv->zv_volsize >> 9);
 
-       if (zil_replay_disable)
-               zil_destroy(dmu_objset_zil(os), B_FALSE);
-       else
-               zil_replay(os, zv, zvol_replay_vector);
+       blk_queue_max_hw_sectors(zv->zv_queue, UINT_MAX);
+       blk_queue_max_segments(zv->zv_queue, UINT16_MAX);
+       blk_queue_max_segment_size(zv->zv_queue, UINT_MAX);
+       blk_queue_physical_block_size(zv->zv_queue, zv->zv_volblocksize);
+       blk_queue_io_opt(zv->zv_queue, zv->zv_volblocksize);
+#ifdef HAVE_BLK_QUEUE_DISCARD
+       blk_queue_max_discard_sectors(zv->zv_queue,
+           (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
+       blk_queue_discard_granularity(zv->zv_queue, zv->zv_volblocksize);
+       queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zv->zv_queue);
+#endif
+#ifdef HAVE_BLK_QUEUE_NONROT
+       queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zv->zv_queue);
+#endif
 
+       if (spa_writeable(dmu_objset_spa(os))) {
+               if (zil_replay_disable)
+                       zil_destroy(dmu_objset_zil(os), B_FALSE);
+               else
+                       zil_replay(os, zv, zvol_replay_vector);
+       }
+
+       zv->zv_objset = NULL;
 out_dmu_objset_disown:
        dmu_objset_disown(os, zvol_tag);
-       zv->zv_objset = NULL;
 out_doi:
        kmem_free(doi, sizeof(dmu_object_info_t));
 out:
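
With the defaults introduced above (zvol_max_discard_blocks = 16384) and the default 8 KiB volume block size, the advertised limit works out to 16384 * 8192 B = 128 MiB per discard request (262144 sectors after the >> 9), and the discard granularity matches zv_volblocksize, which lines up with the alignment trimming done in zvol_discard().
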
@@ -1176,7 +1411,7 @@ zvol_create_minor(const char *name)
        int error;
 
        mutex_enter(&zvol_state_lock);
-       error = __zvol_create_minor(name);
+       error = __zvol_create_minor(name, B_FALSE);
        mutex_exit(&zvol_state_lock);
 
        return (error);
@@ -1224,7 +1459,8 @@ zvol_create_minors_cb(spa_t *spa, uint64_t dsobj,
        if (strchr(dsname, '/') == NULL)
                return 0;
 
-       return __zvol_create_minor(dsname);
+       (void) __zvol_create_minor(dsname, B_FALSE);
+       return (0);
 }
 
 /*
@@ -1237,6 +1473,9 @@ zvol_create_minors(const char *pool)
        spa_t *spa = NULL;
        int error = 0;
 
+       if (zvol_inhibit_dev)
+               return (0);
+
        mutex_enter(&zvol_state_lock);
        if (pool) {
                error = dmu_objset_find_spa(NULL, pool, zvol_create_minors_cb,
@@ -1266,7 +1505,10 @@ zvol_remove_minors(const char *pool)
        zvol_state_t *zv, *zv_next;
        char *str;
 
-       str = kmem_zalloc(DISK_NAME_LEN, KM_SLEEP);
+       if (zvol_inhibit_dev)
+               return;
+
+       str = kmem_zalloc(MAXNAMELEN, KM_SLEEP);
        if (pool) {
                (void) strncpy(str, pool, strlen(pool));
                (void) strcat(str, "/");
@@ -1282,41 +1524,75 @@ zvol_remove_minors(const char *pool)
                }
        }
        mutex_exit(&zvol_state_lock);
-       kmem_free(str, DISK_NAME_LEN);
+       kmem_free(str, MAXNAMELEN);
+}
+
+static int
+snapdev_snapshot_changed_cb(const char *dsname, void *arg)
+{
+       uint64_t snapdev = *(uint64_t *)arg;
+
+       if (strchr(dsname, '@') == NULL)
+               return (0);
+
+       switch (snapdev) {
+       case ZFS_SNAPDEV_VISIBLE:
+               mutex_enter(&zvol_state_lock);
+               (void) __zvol_create_minor(dsname, B_TRUE);
+               mutex_exit(&zvol_state_lock);
+               break;
+       case ZFS_SNAPDEV_HIDDEN:
+               (void) zvol_remove_minor(dsname);
+               break;
+       }
+       return (0);
 }
 
 int
+zvol_set_snapdev(const char *dsname, uint64_t snapdev)
+{
+       (void) dmu_objset_find((char *)dsname, snapdev_snapshot_changed_cb,
+           &snapdev, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+       /* caller should continue to modify snapdev property */
+       return (-1);
+}
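
This hook is driven by the snapdev dataset property: for example, zfs set snapdev=visible pool/vol walks pool/vol and its snapshots through snapdev_snapshot_changed_cb(), creating the /dev/zvol/... nodes for snapshots (note the B_TRUE argument, which skips the hidden check for minors being made visible), while snapdev=hidden removes them again. The deliberate return of -1 tells the caller that the property itself still needs to be stored through the normal property-setting path.
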
+
+int
 zvol_init(void)
 {
        int error;
 
-       if (!zvol_threads)
-               zvol_threads = num_online_cpus();
+       list_create(&zvol_state_list, sizeof (zvol_state_t),
+                   offsetof(zvol_state_t, zv_next));
+       mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
 
        zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_threads, maxclsyspri,
                                  zvol_threads, INT_MAX, TASKQ_PREPOPULATE);
        if (zvol_taskq == NULL) {
                printk(KERN_INFO "ZFS: taskq_create() failed\n");
-               return (-ENOMEM);
+               error = -ENOMEM;
+               goto out1;
        }
 
        error = register_blkdev(zvol_major, ZVOL_DRIVER);
        if (error) {
                printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
-               taskq_destroy(zvol_taskq);
-               return (error);
+               goto out2;
        }
 
        blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS,
                            THIS_MODULE, zvol_probe, NULL, NULL);
 
-       mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
-       list_create(&zvol_state_list, sizeof (zvol_state_t),
-                   offsetof(zvol_state_t, zv_next));
-
        (void) zvol_create_minors(NULL);
 
        return (0);
+
+out2:
+       taskq_destroy(zvol_taskq);
+out1:
+       mutex_destroy(&zvol_state_lock);
+       list_destroy(&zvol_state_list);
+
+       return (error);
 }
 
 void
@@ -1330,8 +1606,14 @@ zvol_fini(void)
        list_destroy(&zvol_state_list);
 }
 
-module_param(zvol_major, uint, 0);
+module_param(zvol_inhibit_dev, uint, 0644);
+MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
+
+module_param(zvol_major, uint, 0444);
 MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
 
-module_param(zvol_threads, uint, 0);
+module_param(zvol_threads, uint, 0444);
 MODULE_PARM_DESC(zvol_threads, "Number of threads for zvol device");
+
+module_param(zvol_max_discard_blocks, ulong, 0444);
+MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard at once");
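
These are parameters of the zfs module, so they can be set at load time (for example, modprobe zfs zvol_inhibit_dev=1) or via an options line in /etc/modprobe.d/. Per the permissions above, zvol_inhibit_dev (0644) also remains writable at runtime through /sys/module/zfs/parameters/, while zvol_major, zvol_threads, and zvol_max_discard_blocks (0444) are read-only once the module is loaded.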