diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 9dd9547..b516156 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -46,8 +46,10 @@
 #include 
 #include 
 
+unsigned int zvol_inhibit_dev = 0;
 unsigned int zvol_major = ZVOL_MAJOR;
 unsigned int zvol_threads = 32;
+unsigned long zvol_max_discard_blocks = 16384;
 
 static taskq_t *zvol_taskq;
 static kmutex_t zvol_state_lock;
@@ -139,6 +141,29 @@ zvol_find_by_name(const char *name)
 	return NULL;
 }
 
+
+/*
+ * Given a path, return TRUE if path is a ZVOL.
+ */
+boolean_t
+zvol_is_zvol(const char *device)
+{
+	struct block_device *bdev;
+	unsigned int major;
+
+	bdev = lookup_bdev(device);
+	if (IS_ERR(bdev))
+		return (B_FALSE);
+
+	major = MAJOR(bdev->bd_dev);
+	bdput(bdev);
+
+	if (major == zvol_major)
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+
 /*
  * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation.
  */
@@ -428,20 +453,20 @@ zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
  * Callback vectors for replaying records.
  * Only TX_WRITE is needed for zvol.
  */
-zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
-	(zil_replay_func_t *)zvol_replay_err,	/* no such transaction type */
-	(zil_replay_func_t *)zvol_replay_err,	/* TX_CREATE */
-	(zil_replay_func_t *)zvol_replay_err,	/* TX_MKDIR */
-	(zil_replay_func_t *)zvol_replay_err,	/* TX_MKXATTR */
-	(zil_replay_func_t *)zvol_replay_err,	/* TX_SYMLINK */
-	(zil_replay_func_t *)zvol_replay_err,	/* TX_REMOVE */
-	(zil_replay_func_t *)zvol_replay_err,	/* TX_RMDIR */
-	(zil_replay_func_t *)zvol_replay_err,	/* TX_LINK */
-	(zil_replay_func_t *)zvol_replay_err,	/* TX_RENAME */
-	(zil_replay_func_t *)zvol_replay_write,	/* TX_WRITE */
-	(zil_replay_func_t *)zvol_replay_err,	/* TX_TRUNCATE */
-	(zil_replay_func_t *)zvol_replay_err,	/* TX_SETATTR */
-	(zil_replay_func_t *)zvol_replay_err,	/* TX_ACL */
+zil_replay_func_t zvol_replay_vector[TX_MAX_TYPE] = {
+	(zil_replay_func_t)zvol_replay_err,	/* no such transaction type */
+	(zil_replay_func_t)zvol_replay_err,	/* TX_CREATE */
+	(zil_replay_func_t)zvol_replay_err,	/* TX_MKDIR */
+	(zil_replay_func_t)zvol_replay_err,	/* TX_MKXATTR */
+	(zil_replay_func_t)zvol_replay_err,	/* TX_SYMLINK */
+	(zil_replay_func_t)zvol_replay_err,	/* TX_REMOVE */
+	(zil_replay_func_t)zvol_replay_err,	/* TX_RMDIR */
+	(zil_replay_func_t)zvol_replay_err,	/* TX_LINK */
+	(zil_replay_func_t)zvol_replay_err,	/* TX_RENAME */
+	(zil_replay_func_t)zvol_replay_write,	/* TX_WRITE */
+	(zil_replay_func_t)zvol_replay_err,	/* TX_TRUNCATE */
+	(zil_replay_func_t)zvol_replay_err,	/* TX_SETATTR */
+	(zil_replay_func_t)zvol_replay_err,	/* TX_ACL */
 };
 
 /*
@@ -459,11 +484,15 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx,
 	uint32_t blocksize = zv->zv_volblocksize;
 	zilog_t *zilog = zv->zv_zilog;
 	boolean_t slogging;
+	ssize_t immediate_write_sz;
 
 	if (zil_replaying(zilog, tx))
 		return;
 
-	slogging = spa_has_slogs(zilog->zl_spa);
+	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+	    ? 0 : zvol_immediate_write_sz;
+	slogging = spa_has_slogs(zilog->zl_spa) &&
+	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
 
 	while (size) {
 		itx_t *itx;
@@ -475,7 +504,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx,
 		 * Unlike zfs_log_write() we can be called with
 		 * up to DMU_MAX_ACCESS/2 (5MB) writes.
 		 */
-		if (blocksize > zvol_immediate_write_sz && !slogging &&
+		if (blocksize > immediate_write_sz && !slogging &&
 		    size >= blocksize && offset % blocksize == 0) {
 			write_state = WR_INDIRECT; /* uses dmu_sync */
 			len = blocksize;
@@ -534,6 +563,14 @@ zvol_write(void *arg)
 	dmu_tx_t *tx;
 	rl_t *rl;
 
+	/*
+	 * Annotate this call path with a flag that indicates that it is
+	 * unsafe to use KM_SLEEP during memory allocations due to the
+	 * potential for a deadlock. KM_PUSHPAGE should be used instead.
+	 */
+	ASSERT(!(current->flags & PF_NOFS));
+	current->flags |= PF_NOFS;
+
 	if (req->cmd_flags & VDEV_REQ_FLUSH)
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
@@ -542,7 +579,7 @@ zvol_write(void *arg)
 	 */
 	if (size == 0) {
 		blk_end_request(req, 0, size);
-		return;
+		goto out;
 	}
 
 	rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER);
@@ -556,7 +593,7 @@ zvol_write(void *arg)
 		dmu_tx_abort(tx);
 		zfs_range_unlock(rl);
 		blk_end_request(req, -error, size);
-		return;
+		goto out;
 	}
 
 	error = dmu_write_req(zv->zv_objset, ZVOL_OBJ, req, tx);
@@ -572,6 +609,8 @@ zvol_write(void *arg)
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
 	blk_end_request(req, -error, size);
+out:
+	current->flags &= ~PF_NOFS;
 }
 
 #ifdef HAVE_BLK_QUEUE_DISCARD
@@ -581,24 +620,41 @@ zvol_discard(void *arg)
 	struct request *req = (struct request *)arg;
 	struct request_queue *q = req->q;
 	zvol_state_t *zv = q->queuedata;
-	uint64_t offset = blk_rq_pos(req) << 9;
-	uint64_t size = blk_rq_bytes(req);
+	uint64_t start = blk_rq_pos(req) << 9;
+	uint64_t end = start + blk_rq_bytes(req);
 	int error;
 	rl_t *rl;
 
-	if (offset + size > zv->zv_volsize) {
-		blk_end_request(req, -EIO, size);
-		return;
+	/*
+	 * Annotate this call path with a flag that indicates that it is
+	 * unsafe to use KM_SLEEP during memory allocations due to the
+	 * potential for a deadlock. KM_PUSHPAGE should be used instead.
+	 */
+	ASSERT(!(current->flags & PF_NOFS));
+	current->flags |= PF_NOFS;
+
+	if (end > zv->zv_volsize) {
+		blk_end_request(req, -EIO, blk_rq_bytes(req));
+		goto out;
 	}
 
-	if (size == 0) {
-		blk_end_request(req, 0, size);
-		return;
+	/*
+	 * Align the request to volume block boundaries. If we don't,
+	 * then this will force dnode_free_range() to zero out the
+	 * unaligned parts, which is slow (read-modify-write) and
+	 * useless since we are not freeing any space by doing so.
+	 */
+	start = P2ROUNDUP(start, zv->zv_volblocksize);
+	end = P2ALIGN(end, zv->zv_volblocksize);
+
+	if (start >= end) {
+		blk_end_request(req, 0, blk_rq_bytes(req));
+		goto out;
 	}
 
-	rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER);
+	rl = zfs_range_lock(&zv->zv_znode, start, end - start, RL_WRITER);
 
-	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, size);
+	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, end - start);
 
 	/*
 	 * TODO: maybe we should add the operation to the log.
@@ -606,7 +662,9 @@ zvol_discard(void *arg)
 
 	zfs_range_unlock(rl);
 
-	blk_end_request(req, -error, size);
+	blk_end_request(req, -error, blk_rq_bytes(req));
+out:
+	current->flags &= ~PF_NOFS;
 }
 #endif /* HAVE_BLK_QUEUE_DISCARD */
 
@@ -759,7 +817,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
 	ASSERT(zio != NULL);
 	ASSERT(size != 0);
 
-	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_PUSHPAGE);
 	zgd->zgd_zilog = zv->zv_zilog;
 	zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
 
@@ -833,25 +891,49 @@ zvol_first_open(zvol_state_t *zv)
 {
 	objset_t *os;
 	uint64_t volsize;
+	int locked = 0;
 	int error;
 	uint64_t ro;
 
+	/*
+	 * In all other cases the spa_namespace_lock is taken before the
+	 * bdev->bd_mutex lock. But in this case the Linux __blkdev_get()
+	 * function calls fops->open() with the bdev->bd_mutex lock held.
+	 *
+	 * To avoid a potential lock inversion deadlock we preemptively
+	 * try to take the spa_namespace_lock(). Normally it will not
+	 * be contended and this is safe because spa_open_common() handles
+	 * the case where the caller already holds the spa_namespace_lock.
+	 *
+	 * When it is contended we risk a lock inversion if we were to
+	 * block waiting for the lock. Luckily, the __blkdev_get()
+	 * function allows us to return -ERESTARTSYS which will result in
+	 * bdev->bd_mutex being dropped, reacquired, and fops->open() being
+	 * called again. This process can be repeated safely until both
+	 * locks are acquired.
+	 */
+	if (!mutex_owned(&spa_namespace_lock)) {
+		locked = mutex_tryenter(&spa_namespace_lock);
+		if (!locked)
+			return (-ERESTARTSYS);
+	}
+
 	/* lie and say we're read-only */
 	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zvol_tag, &os);
 	if (error)
-		return (-error);
+		goto out_mutex;
 
 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
 	if (error) {
-		dmu_objset_disown(os, zvol_tag);
-		return (-error);
+		dmu_objset_disown(os, zvol_tag);
+		goto out_mutex;
 	}
 
 	zv->zv_objset = os;
 	error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
 	if (error) {
-		dmu_objset_disown(os, zvol_tag);
-		return (-error);
+		dmu_objset_disown(os, zvol_tag);
+		goto out_mutex;
 	}
 
 	set_capacity(zv->zv_disk, volsize >> 9);
@@ -859,14 +941,19 @@ zvol_first_open(zvol_state_t *zv)
 	zv->zv_zilog = zil_open(os, zvol_get_data);
 
 	VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL) == 0);
-	if (ro || dmu_objset_is_snapshot(os)) {
-		set_disk_ro(zv->zv_disk, 1);
-		zv->zv_flags |= ZVOL_RDONLY;
+	if (ro || dmu_objset_is_snapshot(os) ||
+	    !spa_writeable(dmu_objset_spa(os))) {
+		set_disk_ro(zv->zv_disk, 1);
+		zv->zv_flags |= ZVOL_RDONLY;
 	} else {
-		set_disk_ro(zv->zv_disk, 0);
-		zv->zv_flags &= ~ZVOL_RDONLY;
+		set_disk_ro(zv->zv_disk, 0);
+		zv->zv_flags &= ~ZVOL_RDONLY;
 	}
 
+out_mutex:
+	if (locked)
+		mutex_exit(&spa_namespace_lock);
+
 	return (-error);
 }
 
@@ -875,8 +962,18 @@ zvol_last_close(zvol_state_t *zv)
 {
 	zil_close(zv->zv_zilog);
 	zv->zv_zilog = NULL;
+
 	dmu_buf_rele(zv->zv_dbuf, zvol_tag);
 	zv->zv_dbuf = NULL;
+
+	/*
+	 * Evict cached data
+	 */
+	if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
+	    !(zv->zv_flags & ZVOL_RDONLY))
+		txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
+	(void) dmu_objset_evict_dbufs(zv->zv_objset);
+
 	dmu_objset_disown(zv->zv_objset, zvol_tag);
 	zv->zv_objset = NULL;
 }
@@ -927,7 +1024,11 @@ out_mutex:
 	return (error);
 }
 
+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
+static void
+#else
 static int
+#endif
 zvol_release(struct gendisk *disk, fmode_t mode)
 {
 	zvol_state_t *zv = disk->private_data;
@@ -947,7 +1048,9 @@ zvol_release(struct gendisk *disk, fmode_t mode)
 	if (drop_mutex)
 		mutex_exit(&zvol_state_lock);
 
+#ifndef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
 	return (0);
+#endif
 }
 
 static int
@@ -1039,7 +1142,7 @@ zvol_probe(dev_t dev, int *part, void *arg)
 
 	mutex_enter(&zvol_state_lock);
 	zv = zvol_find_by_dev(dev);
-	kobj = zv ? get_disk(zv->zv_disk) : ERR_PTR(-ENOENT);
+	kobj = zv ? get_disk(zv->zv_disk) : NULL;
 	mutex_exit(&zvol_state_lock);
 
 	return kobj;
@@ -1114,15 +1217,26 @@ static zvol_state_t *
 zvol_alloc(dev_t dev, const char *name)
 {
 	zvol_state_t *zv;
+	int error = 0;
 
 	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
-	if (zv == NULL)
-		goto out;
+
+	spin_lock_init(&zv->zv_lock);
+	list_link_init(&zv->zv_next);
 
 	zv->zv_queue = blk_init_queue(zvol_request, &zv->zv_lock);
 	if (zv->zv_queue == NULL)
 		goto out_kmem;
 
+#ifdef HAVE_ELEVATOR_CHANGE
+	error = elevator_change(zv->zv_queue, "noop");
+#endif /* HAVE_ELEVATOR_CHANGE */
+	if (error) {
+		printk("ZFS: Unable to set \"%s\" scheduler for zvol %s: %d\n",
+		    "noop", name, error);
+		goto out_queue;
+	}
+
 #ifdef HAVE_BLK_QUEUE_FLUSH
 	blk_queue_flush(zv->zv_queue, VDEV_REQ_FLUSH | VDEV_REQ_FUA);
 #else
@@ -1143,9 +1257,6 @@ zvol_alloc(dev_t dev, const char *name)
 	    sizeof (rl_t), offsetof(rl_t, r_node));
 	zv->zv_znode.z_is_zvol = TRUE;
 
-	spin_lock_init(&zv->zv_lock);
-	list_link_init(&zv->zv_next);
-
 	zv->zv_disk->major = zvol_major;
 	zv->zv_disk->first_minor = (dev & MINORMASK);
 	zv->zv_disk->fops = &zvol_ops;
@@ -1160,7 +1271,7 @@ out_queue:
 	blk_cleanup_queue(zv->zv_queue);
 out_kmem:
 	kmem_free(zv, sizeof (zvol_state_t));
-out:
+
 	return NULL;
 }
 
@@ -1181,7 +1292,28 @@ zvol_free(zvol_state_t *zv)
 }
 
 static int
-__zvol_create_minor(const char *name)
+__zvol_snapdev_hidden(const char *name)
+{
+	uint64_t snapdev;
+	char *parent;
+	char *atp;
+	int error = 0;
+
+	parent = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+	(void) strlcpy(parent, name, MAXPATHLEN);
+
+	if ((atp = strrchr(parent, '@')) != NULL) {
+		*atp = '\0';
+		error = dsl_prop_get_integer(parent, "snapdev", &snapdev, NULL);
+		if ((error == 0) && (snapdev == ZFS_SNAPDEV_HIDDEN))
+			error = ENODEV;
+	}
+
+	kmem_free(parent, MAXPATHLEN);
+	return (error);
+}
+
+static int
+__zvol_create_minor(const char *name, boolean_t ignore_snapdev)
 {
 	zvol_state_t *zv;
 	objset_t *os;
@@ -1198,6 +1330,12 @@ __zvol_create_minor(const char *name)
 		goto out;
 	}
 
+	if (ignore_snapdev == B_FALSE) {
+		error = __zvol_snapdev_hidden(name);
+		if (error)
+			goto out;
+	}
+
 	doi = kmem_alloc(sizeof(dmu_object_info_t), KM_SLEEP);
 
 	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os);
@@ -1237,21 +1375,25 @@ __zvol_create_minor(const char *name)
 	blk_queue_physical_block_size(zv->zv_queue, zv->zv_volblocksize);
 	blk_queue_io_opt(zv->zv_queue, zv->zv_volblocksize);
 #ifdef HAVE_BLK_QUEUE_DISCARD
-	blk_queue_max_discard_sectors(zv->zv_queue, UINT_MAX);
+	blk_queue_max_discard_sectors(zv->zv_queue,
+	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
+	blk_queue_discard_granularity(zv->zv_queue, zv->zv_volblocksize);
 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zv->zv_queue);
 #endif
 #ifdef HAVE_BLK_QUEUE_NONROT
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zv->zv_queue);
 #endif
 
-	if (zil_replay_disable)
-		zil_destroy(dmu_objset_zil(os), B_FALSE);
-	else
-		zil_replay(os, zv, zvol_replay_vector);
+	if (spa_writeable(dmu_objset_spa(os))) {
+		if (zil_replay_disable)
+			zil_destroy(dmu_objset_zil(os), B_FALSE);
+		else
+			zil_replay(os, zv, zvol_replay_vector);
+	}
 
+	zv->zv_objset = NULL;
 out_dmu_objset_disown:
 	dmu_objset_disown(os, zvol_tag);
-	zv->zv_objset = NULL;
 out_doi:
 	kmem_free(doi, sizeof(dmu_object_info_t));
 out:
@@ -1275,7 +1417,7 @@ zvol_create_minor(const char *name)
 	int error;
 
 	mutex_enter(&zvol_state_lock);
-	error = __zvol_create_minor(name);
+	error = __zvol_create_minor(name, B_FALSE);
 	mutex_exit(&zvol_state_lock);
 
 	return (error);
@@ -1323,7 +1465,7 @@ zvol_create_minors_cb(spa_t *spa, uint64_t dsobj,
 	if (strchr(dsname, '/') == NULL)
 		return 0;
 
-	(void) __zvol_create_minor(dsname);
+	(void) __zvol_create_minor(dsname, B_FALSE);
 	return (0);
 }
 
@@ -1337,6 +1479,9 @@ zvol_create_minors(const char *pool)
 	spa_t *spa = NULL;
 	int error = 0;
 
+	if (zvol_inhibit_dev)
+		return (0);
+
 	mutex_enter(&zvol_state_lock);
 	if (pool) {
 		error = dmu_objset_find_spa(NULL, pool, zvol_create_minors_cb,
@@ -1366,6 +1511,9 @@ zvol_remove_minors(const char *pool)
 	zvol_state_t *zv, *zv_next;
 	char *str;
 
+	if (zvol_inhibit_dev)
+		return;
+
 	str = kmem_zalloc(MAXNAMELEN, KM_SLEEP);
 	if (pool) {
 		(void) strncpy(str, pool, strlen(pool));
@@ -1385,41 +1533,70 @@ zvol_remove_minors(const char *pool)
 	kmem_free(str, MAXNAMELEN);
 }
 
+static int
+snapdev_snapshot_changed_cb(const char *dsname, void *arg) {
+	uint64_t snapdev = *(uint64_t *) arg;
+
+	if (strchr(dsname, '@') == NULL)
+		return 0;
+
+	switch (snapdev) {
+	case ZFS_SNAPDEV_VISIBLE:
+		mutex_enter(&zvol_state_lock);
+		(void) __zvol_create_minor(dsname, B_TRUE);
+		mutex_exit(&zvol_state_lock);
+		break;
+	case ZFS_SNAPDEV_HIDDEN:
+		(void) zvol_remove_minor(dsname);
+		break;
+	}
+	return 0;
+}
+
+int
+zvol_set_snapdev(const char *dsname, uint64_t snapdev) {
+	(void) dmu_objset_find((char *) dsname, snapdev_snapshot_changed_cb,
+	    &snapdev, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+	/* caller should continue to modify snapdev property */
+	return (-1);
+}
+
+
 int
 zvol_init(void)
 {
 	int error;
 
-	/*
-	 * The zvol taskqs are created with TASKQ_NORECLAIM so they may be
-	 * used safely as a swap device. If direct reclaim is allowed then
-	 * they quickly deadlock in one of the internal memory allocations.
- */ + list_create(&zvol_state_list, sizeof (zvol_state_t), + offsetof(zvol_state_t, zv_next)); + mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL); + zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_threads, maxclsyspri, - zvol_threads, INT_MAX, - TASKQ_PREPOPULATE | TASKQ_NORECLAIM); + zvol_threads, INT_MAX, TASKQ_PREPOPULATE); if (zvol_taskq == NULL) { printk(KERN_INFO "ZFS: taskq_create() failed\n"); - return (-ENOMEM); + error = -ENOMEM; + goto out1; } error = register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); - taskq_destroy(zvol_taskq); - return (error); + goto out2; } blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS, THIS_MODULE, zvol_probe, NULL, NULL); - mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&zvol_state_list, sizeof (zvol_state_t), - offsetof(zvol_state_t, zv_next)); + return (0); - (void) zvol_create_minors(NULL); +out2: + taskq_destroy(zvol_taskq); +out1: + mutex_destroy(&zvol_state_lock); + list_destroy(&zvol_state_list); - return (0); + return (error); } void @@ -1433,8 +1610,14 @@ zvol_fini(void) list_destroy(&zvol_state_list); } +module_param(zvol_inhibit_dev, uint, 0644); +MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); + module_param(zvol_major, uint, 0444); MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); module_param(zvol_threads, uint, 0444); MODULE_PARM_DESC(zvol_threads, "Number of threads for zvol device"); + +module_param(zvol_max_discard_blocks, ulong, 0444); +MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard at once");
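
Notes on the mechanisms this patch touches follow; the code sketches are
userspace analogues written to illustrate the review, not code from the tree.

The new zvol_is_zvol() classifies a path purely by block-device major number:
look up the bdev, compare MAJOR(bd_dev) against zvol_major, drop the
reference. The same test can be made from userspace with stat(2); the default
major is ZVOL_MAJOR (230 on Linux), but the sketch below takes it as a
parameter rather than hard-coding the value:

    #include <stdio.h>
    #include <sys/stat.h>
    #include <sys/sysmacros.h>      /* major() */

    /* Userspace analogue of zvol_is_zvol(): does this path name a zvol? */
    static int
    is_zvol(const char *path, unsigned int zvol_major)
    {
            struct stat st;

            if (stat(path, &st) != 0 || !S_ISBLK(st.st_mode))
                    return (0);

            return (major(st.st_rdev) == zvol_major);
    }

    int
    main(void)
    {
            /* 230 is the usual ZVOL_MAJOR; confirm against /proc/devices. */
            printf("%d\n", is_zvol("/dev/zd0", 230));
            return (0);
    }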
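
In zvol_log_write(), logbias now behaves as documented: with
logbias=throughput, immediate_write_sz is forced to 0 so every aligned,
full-block chunk is logged WR_INDIRECT (the data block is written once by
dmu_sync() and the ZIL record carries only a pointer), and slog devices are
treated as present only under logbias=latency. Condensed from the hunk above
(the WR_COPIED/WR_NEED_COPY fallbacks are omitted):

    #include <stdint.h>

    /*
     * With logbias=throughput, immediate_write_sz is 0 and slogging is
     * false, so any full, block-aligned chunk takes the indirect path.
     */
    static int
    use_indirect_write(uint64_t blocksize, uint64_t immediate_write_sz,
        int slogging, uint64_t offset, uint64_t size)
    {
            return (blocksize > immediate_write_sz && !slogging &&
                size >= blocksize && offset % blocksize == 0);
    }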
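
The PF_NOFS annotations in zvol_write() and zvol_discard() exist because a
KM_SLEEP allocation on the I/O path can recurse into direct reclaim, which
may issue more I/O against the same zvol and deadlock; allocations on these
paths must use KM_PUSHPAGE instead. A userspace analogue (hypothetical names,
not the SPL implementation) of how such a flag turns the latent deadlock into
an assertion that testing can catch:

    #include <assert.h>
    #include <stdlib.h>

    /* Analogue of current->flags & PF_NOFS: a per-thread "no reclaim" mark. */
    static _Thread_local int nofs;

    static void *
    km_sleep_alloc(size_t n)
    {
            assert(!nofs);          /* KM_SLEEP is forbidden under PF_NOFS */
            return (malloc(n));
    }

    static void *
    km_pushpage_alloc(size_t n)
    {
            return (malloc(n));     /* safe: never recurses into "reclaim" */
    }

    int
    main(void)
    {
            nofs = 1;                               /* |= PF_NOFS  */
            free(km_pushpage_alloc(4096));          /* OK          */
            nofs = 0;                               /* &= ~PF_NOFS */
            free(km_sleep_alloc(4096));             /* OK again    */
            return (0);
    }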
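
The rewritten zvol_discard() rounds the start of the request up, and the end
down, to volblocksize boundaries before calling dmu_free_long_range(), so
only whole blocks are freed and dnode_free_range() never has to
read-modify-write a partial block. A standalone demonstration of the
arithmetic, with the P2* macros reproduced from the sysmacros definitions and
made-up request values:

    #include <stdint.h>
    #include <stdio.h>

    /* Power-of-two alignment helpers, as in sys/sysmacros.h. */
    #define P2ALIGN(x, align)       ((x) & -(align))
    #define P2ROUNDUP(x, align)     (-(-(x) & -(align)))

    int
    main(void)
    {
            uint64_t volblocksize = 8192;   /* example 8K volblocksize */
            uint64_t start = 4096;          /* unaligned discard start */
            uint64_t end = start + 20000;   /* unaligned end: 24096    */

            start = P2ROUNDUP(start, volblocksize); /* 4096  -> 8192  */
            end = P2ALIGN(end, volblocksize);       /* 24096 -> 16384 */

            if (start >= end)
                    printf("nothing to free\n");
            else    /* prints: free [8192, 16384), exactly one block */
                    printf("free [%ju, %ju)\n",
                        (uintmax_t)start, (uintmax_t)end);
            return (0);
    }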
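
The long comment in zvol_first_open() describes a classic
try-lock-and-back-off escape from lock-order inversion: the established order
is spa_namespace_lock before bd_mutex, but __blkdev_get() calls fops->open()
with bd_mutex already held, so open may only mutex_tryenter() the namespace
lock and returns -ERESTARTSYS (making the caller drop and retake bd_mutex)
when that fails. The same pattern in generic pthreads form, with hypothetical
locks A and B:

    #include <pthread.h>
    #include <sched.h>

    /*
     * Lock order everywhere else is A, then B.  This function is entered
     * already holding B, so blocking on A could deadlock against a thread
     * that holds A and waits for B.  Instead: try A, and on failure drop
     * B (the -ERESTARTSYS analogue), yield, and retry.
     */
    static void
    take_a_while_holding_b(pthread_mutex_t *a, pthread_mutex_t *b)
    {
            while (pthread_mutex_trylock(a) != 0) {
                    pthread_mutex_unlock(b);
                    sched_yield();
                    pthread_mutex_lock(b);
            }
            /* Both locks held, with no window for inversion. */
    }

    int
    main(void)
    {
            pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
            pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

            pthread_mutex_lock(&b);         /* arrive as fops->open() does */
            take_a_while_holding_b(&a, &b);
            pthread_mutex_unlock(&a);
            pthread_mutex_unlock(&b);
            return (0);
    }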
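
Relatedly, zvol_max_discard_blocks replaces the old UINT_MAX sector limit so
a single discard covers a bounded amount of work: with the default of 16384
blocks and an 8 KiB volblocksize, blk_queue_max_discard_sectors() is handed
(16384 * 8192) >> 9 = 262144 512-byte sectors, i.e. at most 128 MiB freed per
request, and blk_queue_discard_granularity() advertises the volblocksize so
well-formed requests arrive block-aligned in the first place.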
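
Finally, the snapdev plumbing: __zvol_snapdev_hidden() truncates the dataset
name at the last '@' to get the parent volume and reads its snapdev property,
so snapshot minors are created only when the property is visible (e.g.
zfs set snapdev=visible pool/vol). When the property changes,
zvol_set_snapdev() walks the dataset with DS_FIND_SNAPSHOTS |
DS_FIND_CHILDREN, creating or removing /dev/zdN nodes as it goes; it
deliberately returns -1 so, per its comment, the caller continues on to
actually store the modified property.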