X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fzvol.c;h=c6cfac26d4afbb2e42576f87f77023abc32e8485;hb=36f86f73f68548f46eb3229c8adf583d59fa9988;hp=5668e1dc0c32c4e1346c54f74fd7de7ff700b928;hpb=04434775b7f3aa55fbbcf2064cfb9f5f5c436e64;p=zfs.git

diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 5668e1d..c6cfac2 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -141,6 +141,29 @@ zvol_find_by_name(const char *name)
 	return NULL;
 }
 
+
+/*
+ * Given a path, return TRUE if path is a ZVOL.
+ */
+boolean_t
+zvol_is_zvol(const char *device)
+{
+	struct block_device *bdev;
+	unsigned int major;
+
+	bdev = lookup_bdev(device);
+	if (IS_ERR(bdev))
+		return (B_FALSE);
+
+	major = MAJOR(bdev->bd_dev);
+	bdput(bdev);
+
+	if (major == zvol_major)
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+
 /*
  * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation.
  */
@@ -597,8 +620,8 @@
 	struct request *req = (struct request *)arg;
 	struct request_queue *q = req->q;
 	zvol_state_t *zv = q->queuedata;
-	uint64_t offset = blk_rq_pos(req) << 9;
-	uint64_t size = blk_rq_bytes(req);
+	uint64_t start = blk_rq_pos(req) << 9;
+	uint64_t end = start + blk_rq_bytes(req);
 	int error;
 	rl_t *rl;
@@ -610,19 +633,28 @@
 	ASSERT(!(current->flags & PF_NOFS));
 	current->flags |= PF_NOFS;
 
-	if (offset + size > zv->zv_volsize) {
-		blk_end_request(req, -EIO, size);
+	if (end > zv->zv_volsize) {
+		blk_end_request(req, -EIO, blk_rq_bytes(req));
 		goto out;
 	}
 
-	if (size == 0) {
-		blk_end_request(req, 0, size);
+	/*
+	 * Align the request to volume block boundaries. If we don't,
+	 * then this will force dnode_free_range() to zero out the
+	 * unaligned parts, which is slow (read-modify-write) and
+	 * useless since we are not freeing any space by doing so.
+	 */
+	start = P2ROUNDUP(start, zv->zv_volblocksize);
+	end = P2ALIGN(end, zv->zv_volblocksize);
+
+	if (start >= end) {
+		blk_end_request(req, 0, blk_rq_bytes(req));
 		goto out;
 	}
 
-	rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER);
+	rl = zfs_range_lock(&zv->zv_znode, start, end - start, RL_WRITER);
 
-	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, size);
+	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, end - start);
 
 	/*
 	 * TODO: maybe we should add the operation to the log.
@@ -630,7 +662,7 @@
 
 	zfs_range_unlock(rl);
 
-	blk_end_request(req, -error, size);
+	blk_end_request(req, -error, blk_rq_bytes(req));
 out:
 	current->flags &= ~PF_NOFS;
 }
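
The alignment math in the discard path above is easy to check in isolation.
The following is a minimal standalone sketch, not part of the patch: it
re-implements P2ROUNDUP/P2ALIGN with the same power-of-two semantics as the
ZFS macros, and the 8 KiB volblocksize and the start/end values are assumed
example inputs, chosen to show an unaligned range shrinking inward to whole
volume blocks.

	#include <stdio.h>
	#include <stdint.h>

	/*
	 * Re-implementations with the same power-of-two semantics as
	 * the ZFS P2ROUNDUP/P2ALIGN macros.
	 */
	#define	P2ROUNDUP(x, align)	((((x) - 1) | ((align) - 1)) + 1)
	#define	P2ALIGN(x, align)	((x) & -(align))

	int
	main(void)
	{
		uint64_t volblocksize = 8192;	/* assumed example: 8 KiB blocks */
		uint64_t start = 4096;		/* unaligned discard start */
		uint64_t end = 20480;		/* start + 16 KiB request */

		start = P2ROUNDUP(start, volblocksize);	/* 4096  -> 8192  */
		end = P2ALIGN(end, volblocksize);	/* 20480 -> 16384 */

		if (start >= end)
			printf("request covers no whole block; nothing to free\n");
		else
			printf("free [%llu, %llu), %llu bytes\n",
			    (unsigned long long)start,
			    (unsigned long long)end,
			    (unsigned long long)(end - start));

		return (0);
	}

With these inputs the 16 KiB request shrinks to the single whole block
[8192, 16384); a request smaller than one block collapses to start >= end
and is completed without freeing anything, which is exactly the early-return
case in the hunk above.
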
@@ -859,25 +891,49 @@ zvol_first_open(zvol_state_t *zv)
 {
 	objset_t *os;
 	uint64_t volsize;
+	int locked = 0;
 	int error;
 	uint64_t ro;
 
+	/*
+	 * In all other cases the spa_namespace_lock is taken before the
+	 * bdev->bd_mutex lock. But in this case the Linux __blkdev_get()
+	 * function calls fops->open() with the bdev->bd_mutex lock held.
+	 *
+	 * To avoid a potential lock inversion deadlock we preemptively
+	 * try to take the spa_namespace_lock(). Normally it will not
+	 * be contended and this is safe because spa_open_common() handles
+	 * the case where the caller already holds the spa_namespace_lock.
+	 *
+	 * When it is contended we risk a lock inversion if we were to
+	 * block waiting for the lock. Luckily, the __blkdev_get()
+	 * function allows us to return -ERESTARTSYS which will result in
+	 * bdev->bd_mutex being dropped, reacquired, and fops->open() being
+	 * called again. This process can be repeated safely until both
+	 * locks are acquired.
+	 */
+	if (!mutex_owned(&spa_namespace_lock)) {
+		locked = mutex_tryenter(&spa_namespace_lock);
+		if (!locked)
+			return (-ERESTARTSYS);
+	}
+
 	/* lie and say we're read-only */
 	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zvol_tag, &os);
 	if (error)
-		return (-error);
+		goto out_mutex;
 
 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
 	if (error) {
-		dmu_objset_disown(os, zvol_tag);
-		return (-error);
+		dmu_objset_disown(os, zvol_tag);
+		goto out_mutex;
 	}
 
 	zv->zv_objset = os;
 	error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
 	if (error) {
-		dmu_objset_disown(os, zvol_tag);
-		return (-error);
+		dmu_objset_disown(os, zvol_tag);
+		goto out_mutex;
 	}
 
 	set_capacity(zv->zv_disk, volsize >> 9);
@@ -886,13 +942,17 @@ zvol_first_open(zvol_state_t *zv)
 
 	VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL) == 0);
 	if (ro || dmu_objset_is_snapshot(os)) {
-		set_disk_ro(zv->zv_disk, 1);
-		zv->zv_flags |= ZVOL_RDONLY;
+		set_disk_ro(zv->zv_disk, 1);
+		zv->zv_flags |= ZVOL_RDONLY;
 	} else {
-		set_disk_ro(zv->zv_disk, 0);
-		zv->zv_flags &= ~ZVOL_RDONLY;
+		set_disk_ro(zv->zv_disk, 0);
+		zv->zv_flags &= ~ZVOL_RDONLY;
 	}
 
+out_mutex:
+	if (locked)
+		mutex_exit(&spa_namespace_lock);
+
 	return (-error);
 }
@@ -1150,6 +1210,7 @@ static zvol_state_t *
 zvol_alloc(dev_t dev, const char *name)
 {
 	zvol_state_t *zv;
+	int error = 0;
 
 	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
 	if (zv == NULL)
@@ -1159,6 +1220,15 @@ zvol_alloc(dev_t dev, const char *name)
 	if (zv->zv_queue == NULL)
 		goto out_kmem;
 
+#ifdef HAVE_ELEVATOR_CHANGE
+	error = elevator_change(zv->zv_queue, "noop");
+#endif /* HAVE_ELEVATOR_CHANGE */
+	if (error) {
+		printk("ZFS: Unable to set \"%s\" scheduler for zvol %s: %d\n",
+		    "noop", name, error);
+		goto out_queue;
+	}
+
 #ifdef HAVE_BLK_QUEUE_FLUSH
 	blk_queue_flush(zv->zv_queue, VDEV_REQ_FLUSH | VDEV_REQ_FUA);
 #else
@@ -1287,9 +1357,9 @@ __zvol_create_minor(const char *name)
 	else
 		zil_replay(os, zv, zvol_replay_vector);
 
+	zv->zv_objset = NULL;
 out_dmu_objset_disown:
 	dmu_objset_disown(os, zvol_tag);
-	zv->zv_objset = NULL;
 out_doi:
 	kmem_free(doi, sizeof(dmu_object_info_t));
 out:
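
The -ERESTARTSYS handshake documented in zvol_first_open() can be sketched
outside the kernel. The snippet below is a userspace analogy only: pthreads
stand in for the SPL mutex API, EAGAIN stands in for the kernel-internal
ERESTARTSYS, and the names lock_a, lock_b, and first_open are illustrative.
lock_b plays the role of bdev->bd_mutex (already held on entry), lock_a plays
the role of spa_namespace_lock, and the caller's loop mirrors what
__blkdev_get() does when fops->open() asks for a restart.

	#include <errno.h>
	#include <pthread.h>
	#include <stdio.h>

	/*
	 * Userspace analogy only: lock_a stands in for spa_namespace_lock
	 * and lock_b for bdev->bd_mutex.  Everywhere else lock_a is taken
	 * before lock_b; on this path we arrive already holding lock_b.
	 */
	static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

	/* Called with lock_b held, like fops->open() in __blkdev_get(). */
	static int
	first_open(void)
	{
		/*
		 * Blocking on lock_a here could deadlock against a thread
		 * that holds lock_a and is waiting for lock_b, so we only
		 * trylock.  EAGAIN plays the role of -ERESTARTSYS.
		 */
		if (pthread_mutex_trylock(&lock_a) != 0)
			return (-EAGAIN);

		/* ... work that needs both locks ... */

		pthread_mutex_unlock(&lock_a);
		return (0);
	}

	int
	main(void)
	{
		int error;

		/* The caller's drop-and-retry loop, mirroring __blkdev_get(). */
		do {
			pthread_mutex_lock(&lock_b);
			error = first_open();
			pthread_mutex_unlock(&lock_b);
		} while (error == -EAGAIN);

		printf("open returned %d\n", error);
		return (0);
	}

The point of the trylock is that backing out and retrying re-establishes the
documented lock_a-before-lock_b ordering without redesigning the open path;
the contended case costs only a dropped and reacquired lock_b.
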