X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fzvol.c;h=b516156372448a286f8e68c761314048d8334ac4;hb=a1d9543a39942be56879ca9338078afc77c25cea;hp=0206dad9ee3f686db78f4b9cbab05a4c94490bb0;hpb=fb5f0bc83330c8a0236c4d34a23723ac1974971a;p=zfs.git diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 0206dad..b516156 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -19,347 +19,398 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* + * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Rewritten for Linux by Brian Behlendorf . + * LLNL-CODE-403049. + * * ZFS volume emulation driver. * * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. * Volumes are accessed through the symbolic links named: * - * /dev/zvol/dsk// - * /dev/zvol/rdsk// + * /dev// * - * These links are created by the ZFS-specific devfsadm link generator. - * Volumes are persistent through reboot. No user command needs to be - * run before opening and using a device. + * Volumes are persistent through reboot and module load. No user command + * needs to be run before opening and using a device. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include #include -#include -#include +#include #include -#include -#include +#include -#include "zfs_namecheck.h" +unsigned int zvol_inhibit_dev = 0; +unsigned int zvol_major = ZVOL_MAJOR; +unsigned int zvol_threads = 32; +unsigned long zvol_max_discard_blocks = 16384; -static void *zvol_state; - -#define ZVOL_DUMPSIZE "dumpsize" - -/* - * This lock protects the zvol_state structure from being modified - * while it's being used, e.g. an open that comes in before a create - * finishes. It also protects temporary opens of the dataset so that, - * e.g., an open doesn't get a spurious EBUSY. - */ +static taskq_t *zvol_taskq; static kmutex_t zvol_state_lock; -static uint32_t zvol_minors; - -typedef struct zvol_extent { - list_node_t ze_node; - dva_t ze_dva; /* dva associated with this extent */ - uint64_t ze_nblks; /* number of blocks in extent */ -} zvol_extent_t; +static list_t zvol_state_list; +static char *zvol_tag = "zvol_tag"; /* * The in-core state of each volume. 
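+ * Each zvol_state_t is linked into the global zvol_state_list (via
+ * zv_next) and is protected by zvol_state_lock.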
*/ typedef struct zvol_state { - char zv_name[MAXPATHLEN]; /* pool/dd name */ - uint64_t zv_volsize; /* amount of space we advertise */ - uint64_t zv_volblocksize; /* volume block size */ - minor_t zv_minor; /* minor number */ - uint8_t zv_min_bs; /* minimum addressable block shift */ - uint8_t zv_flags; /* readonly; dumpified */ - objset_t *zv_objset; /* objset handle */ - uint32_t zv_mode; /* DS_MODE_* flags at open time */ - uint32_t zv_open_count[OTYPCNT]; /* open counts */ - uint32_t zv_total_opens; /* total open count */ - zilog_t *zv_zilog; /* ZIL handle */ - list_t zv_extents; /* List of extents for dump */ - znode_t zv_znode; /* for range locking */ + char zv_name[MAXNAMELEN]; /* name */ + uint64_t zv_volsize; /* advertised space */ + uint64_t zv_volblocksize;/* volume block size */ + objset_t *zv_objset; /* objset handle */ + uint32_t zv_flags; /* ZVOL_* flags */ + uint32_t zv_open_count; /* open counts */ + uint32_t zv_changed; /* disk changed */ + zilog_t *zv_zilog; /* ZIL handle */ + znode_t zv_znode; /* for range locking */ + dmu_buf_t *zv_dbuf; /* bonus handle */ + dev_t zv_dev; /* device id */ + struct gendisk *zv_disk; /* generic disk */ + struct request_queue *zv_queue; /* request queue */ + spinlock_t zv_lock; /* request queue lock */ + list_node_t zv_next; /* next zvol_state_t linkage */ } zvol_state_t; -/* - * zvol specific flags - */ #define ZVOL_RDONLY 0x1 -#define ZVOL_DUMPIFIED 0x2 -#define ZVOL_EXCL 0x4 /* - * zvol maximum transfer in one DMU tx. + * Find the next available range of ZVOL_MINORS minor numbers. The + * zvol_state_list is kept in ascending minor order so we simply need + * to scan the list for the first gap in the sequence. This allows us + * to recycle minor number as devices are created and removed. */ -int zvol_maxphys = DMU_MAX_ACCESS/2; +static int +zvol_find_minor(unsigned *minor) +{ + zvol_state_t *zv; + + *minor = 0; + ASSERT(MUTEX_HELD(&zvol_state_lock)); + for (zv = list_head(&zvol_state_list); zv != NULL; + zv = list_next(&zvol_state_list, zv), *minor += ZVOL_MINORS) { + if (MINOR(zv->zv_dev) != MINOR(*minor)) + break; + } -extern int zfs_set_prop_nvlist(const char *, nvlist_t *); -static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio); -static int zvol_dumpify(zvol_state_t *zv); -static int zvol_dump_fini(zvol_state_t *zv); -static int zvol_dump_init(zvol_state_t *zv, boolean_t resize); + /* All minors are in use */ + if (*minor >= (1 << MINORBITS)) + return ENXIO; -static void -zvol_size_changed(zvol_state_t *zv, major_t maj) + return 0; +} + +/* + * Find a zvol_state_t given the full major+minor dev_t. + */ +static zvol_state_t * +zvol_find_by_dev(dev_t dev) { - dev_t dev = makedevice(maj, zv->zv_minor); + zvol_state_t *zv; - VERIFY(ddi_prop_update_int64(dev, zfs_dip, - "Size", zv->zv_volsize) == DDI_SUCCESS); - VERIFY(ddi_prop_update_int64(dev, zfs_dip, - "Nblocks", lbtodb(zv->zv_volsize)) == DDI_SUCCESS); + ASSERT(MUTEX_HELD(&zvol_state_lock)); + for (zv = list_head(&zvol_state_list); zv != NULL; + zv = list_next(&zvol_state_list, zv)) { + if (zv->zv_dev == dev) + return zv; + } - /* Notify specfs to invalidate the cached size */ - spec_size_invalidate(dev, VBLK); - spec_size_invalidate(dev, VCHR); + return NULL; } -int -zvol_check_volsize(uint64_t volsize, uint64_t blocksize) +/* + * Find a zvol_state_t given the name provided at zvol_alloc() time. 
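+ * Like zvol_find_by_dev() this is a linear scan of zvol_state_list,
+ * so the caller must already hold zvol_state_lock.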
+ */ +static zvol_state_t * +zvol_find_by_name(const char *name) { - if (volsize == 0) - return (EINVAL); + zvol_state_t *zv; - if (volsize % blocksize != 0) - return (EINVAL); + ASSERT(MUTEX_HELD(&zvol_state_lock)); + for (zv = list_head(&zvol_state_list); zv != NULL; + zv = list_next(&zvol_state_list, zv)) { + if (!strncmp(zv->zv_name, name, MAXNAMELEN)) + return zv; + } -#ifdef _ILP32 - if (volsize - 1 > SPEC_MAXOFFSET_T) - return (EOVERFLOW); -#endif - return (0); + return NULL; } -int -zvol_check_volblocksize(uint64_t volblocksize) + +/* + * Given a path, return TRUE if path is a ZVOL. + */ +boolean_t +zvol_is_zvol(const char *device) { - if (volblocksize < SPA_MINBLOCKSIZE || - volblocksize > SPA_MAXBLOCKSIZE || - !ISP2(volblocksize)) - return (EDOM); + struct block_device *bdev; + unsigned int major; - return (0); + bdev = lookup_bdev(device); + if (IS_ERR(bdev)) + return (B_FALSE); + + major = MAJOR(bdev->bd_dev); + bdput(bdev); + + if (major == zvol_major) + return (B_TRUE); + + return (B_FALSE); } -static void -zvol_readonly_changed_cb(void *arg, uint64_t newval) +/* + * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation. + */ +void +zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { - zvol_state_t *zv = arg; + zfs_creat_t *zct = arg; + nvlist_t *nvprops = zct->zct_props; + int error; + uint64_t volblocksize, volsize; - if (newval) - zv->zv_flags |= ZVOL_RDONLY; - else - zv->zv_flags &= ~ZVOL_RDONLY; + VERIFY(nvlist_lookup_uint64(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0); + if (nvlist_lookup_uint64(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) + volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); + + /* + * These properties must be removed from the list so the generic + * property setting step won't apply to them. + */ + VERIFY(nvlist_remove_all(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0); + (void) nvlist_remove_all(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); + + error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); + ASSERT(error == 0); } +/* + * ZFS_IOC_OBJSET_STATS entry point. + */ int zvol_get_stats(objset_t *os, nvlist_t *nv) { int error; - dmu_object_info_t doi; + dmu_object_info_t *doi; uint64_t val; - error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); if (error) return (error); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); - - error = dmu_object_info(os, ZVOL_OBJ, &doi); + doi = kmem_alloc(sizeof(dmu_object_info_t), KM_SLEEP); + error = dmu_object_info(os, ZVOL_OBJ, doi); if (error == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, - doi.doi_data_block_size); + doi->doi_data_block_size); } + kmem_free(doi, sizeof(dmu_object_info_t)); + return (error); } /* - * Find a free minor number. + * Sanity check volume size. 
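+ * A valid volume size is a non-zero multiple of the block size (and
+ * on 32-bit systems it must not overflow MAXOFFSET_T).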
*/ -static minor_t -zvol_minor_alloc(void) +int +zvol_check_volsize(uint64_t volsize, uint64_t blocksize) { - minor_t minor; - - ASSERT(MUTEX_HELD(&zvol_state_lock)); + if (volsize == 0) + return (EINVAL); - for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) - if (ddi_get_soft_state(zvol_state, minor) == NULL) - return (minor); + if (volsize % blocksize != 0) + return (EINVAL); +#ifdef _ILP32 + if (volsize - 1 > MAXOFFSET_T) + return (EOVERFLOW); +#endif return (0); } -static zvol_state_t * -zvol_minor_lookup(const char *name) +/* + * Ensure the zap is flushed then inform the VFS of the capacity change. + */ +static int +zvol_update_volsize(zvol_state_t *zv, uint64_t volsize, objset_t *os) { - minor_t minor; - zvol_state_t *zv; + struct block_device *bdev; + dmu_tx_t *tx; + int error; ASSERT(MUTEX_HELD(&zvol_state_lock)); - for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) { - zv = ddi_get_soft_state(zvol_state, minor); - if (zv == NULL) - continue; - if (strcmp(zv->zv_name, name) == 0) - break; + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); } - return (zv); -} + error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, + &volsize, tx); + dmu_tx_commit(tx); -/* extent mapping arg */ -struct maparg { - zvol_state_t *ma_zv; - uint64_t ma_blks; -}; + if (error) + return (error); -/*ARGSUSED*/ -static int -zvol_map_block(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, - const dnode_phys_t *dnp, void *arg) -{ - struct maparg *ma = arg; - zvol_extent_t *ze; - int bs = ma->ma_zv->zv_volblocksize; + error = dmu_free_long_range(os, + ZVOL_OBJ, volsize, DMU_OBJECT_END); + if (error) + return (error); - if (bp == NULL || zb->zb_object != ZVOL_OBJ || zb->zb_level != 0) - return (0); + bdev = bdget_disk(zv->zv_disk, 0); + if (!bdev) + return (EIO); +/* + * 2.6.28 API change + * Added check_disk_size_change() helper function. + */ +#ifdef HAVE_CHECK_DISK_SIZE_CHANGE + set_capacity(zv->zv_disk, volsize >> 9); + zv->zv_volsize = volsize; + check_disk_size_change(zv->zv_disk, bdev); +#else + zv->zv_volsize = volsize; + zv->zv_changed = 1; + (void) check_disk_change(bdev); +#endif /* HAVE_CHECK_DISK_SIZE_CHANGE */ - VERIFY3U(ma->ma_blks, ==, zb->zb_blkid); - ma->ma_blks++; + bdput(bdev); - /* Abort immediately if we have encountered gang blocks */ - if (BP_IS_GANG(bp)) - return (EFRAGS); + return (0); +} - /* - * See if the block is at the end of the previous extent. - */ - ze = list_tail(&ma->ma_zv->zv_extents); - if (ze && - DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) && - DVA_GET_OFFSET(BP_IDENTITY(bp)) == - DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) { - ze->ze_nblks++; - return (0); +/* + * Set ZFS_PROP_VOLSIZE set entry point. 
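+ * The new size must pass zvol_check_volsize(), and both the dataset
+ * and the block device must be writable or EROFS is returned.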
+ */ +int +zvol_set_volsize(const char *name, uint64_t volsize) +{ + zvol_state_t *zv; + dmu_object_info_t *doi; + objset_t *os = NULL; + uint64_t readonly; + int error; + + mutex_enter(&zvol_state_lock); + + zv = zvol_find_by_name(name); + if (zv == NULL) { + error = ENXIO; + goto out; } - dprintf_bp(bp, "%s", "next blkptr:"); + doi = kmem_alloc(sizeof(dmu_object_info_t), KM_SLEEP); - /* start a new extent */ - ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP); - ze->ze_dva = bp->blk_dva[0]; /* structure assignment */ - ze->ze_nblks = 1; - list_insert_tail(&ma->ma_zv->zv_extents, ze); - return (0); -} + error = dmu_objset_hold(name, FTAG, &os); + if (error) + goto out_doi; -static void -zvol_free_extents(zvol_state_t *zv) -{ - zvol_extent_t *ze; + if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) != 0 || + (error = zvol_check_volsize(volsize,doi->doi_data_block_size)) != 0) + goto out_doi; + + VERIFY(dsl_prop_get_integer(name, "readonly", &readonly, NULL) == 0); + if (readonly) { + error = EROFS; + goto out_doi; + } - while (ze = list_head(&zv->zv_extents)) { - list_remove(&zv->zv_extents, ze); - kmem_free(ze, sizeof (zvol_extent_t)); + if (get_disk_ro(zv->zv_disk) || (zv->zv_flags & ZVOL_RDONLY)) { + error = EROFS; + goto out_doi; } + + error = zvol_update_volsize(zv, volsize, os); +out_doi: + kmem_free(doi, sizeof(dmu_object_info_t)); +out: + if (os) + dmu_objset_rele(os, FTAG); + + mutex_exit(&zvol_state_lock); + + return (error); } -static int -zvol_get_lbas(zvol_state_t *zv) +/* + * Sanity check volume block size. + */ +int +zvol_check_volblocksize(uint64_t volblocksize) { - struct maparg ma; - int err; - - ma.ma_zv = zv; - ma.ma_blks = 0; - zvol_free_extents(zv); - - err = traverse_dataset(dmu_objset_ds(zv->zv_objset), 0, - TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma); - if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) { - zvol_free_extents(zv); - return (err ? err : EIO); - } + if (volblocksize < SPA_MINBLOCKSIZE || + volblocksize > SPA_MAXBLOCKSIZE || + !ISP2(volblocksize)) + return (EDOM); return (0); } -/* ARGSUSED */ -void -zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) +/* + * Set ZFS_PROP_VOLBLOCKSIZE set entry point. + */ +int +zvol_set_volblocksize(const char *name, uint64_t volblocksize) { - zfs_creat_t *zct = arg; - nvlist_t *nvprops = zct->zct_props; + zvol_state_t *zv; + dmu_tx_t *tx; int error; - uint64_t volblocksize, volsize; - VERIFY(nvlist_lookup_uint64(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0); - if (nvlist_lookup_uint64(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) - volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); + mutex_enter(&zvol_state_lock); - /* - * These properties must be removed from the list so the generic - * property setting step won't apply to them. 
- */ - VERIFY(nvlist_remove_all(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0); - (void) nvlist_remove_all(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); + zv = zvol_find_by_name(name); + if (zv == NULL) { + error = ENXIO; + goto out; + } - error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, - DMU_OT_NONE, 0, tx); - ASSERT(error == 0); + if (get_disk_ro(zv->zv_disk) || (zv->zv_flags & ZVOL_RDONLY)) { + error = EROFS; + goto out; + } - error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, - DMU_OT_NONE, 0, tx); - ASSERT(error == 0); + tx = dmu_tx_create(zv->zv_objset); + dmu_tx_hold_bonus(tx, ZVOL_OBJ); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ, + volblocksize, 0, tx); + if (error == ENOTSUP) + error = EBUSY; + dmu_tx_commit(tx); + if (error == 0) + zv->zv_volblocksize = volblocksize; + } +out: + mutex_exit(&zvol_state_lock); - error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); - ASSERT(error == 0); + return (error); } /* @@ -392,7 +443,6 @@ zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap) return (error); } -/* ARGSUSED */ static int zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap) { @@ -403,498 +453,350 @@ zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap) * Callback vectors for replaying records. * Only TX_WRITE is needed for zvol. */ -zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { - zvol_replay_err, /* 0 no such transaction type */ - zvol_replay_err, /* TX_CREATE */ - zvol_replay_err, /* TX_MKDIR */ - zvol_replay_err, /* TX_MKXATTR */ - zvol_replay_err, /* TX_SYMLINK */ - zvol_replay_err, /* TX_REMOVE */ - zvol_replay_err, /* TX_RMDIR */ - zvol_replay_err, /* TX_LINK */ - zvol_replay_err, /* TX_RENAME */ - zvol_replay_write, /* TX_WRITE */ - zvol_replay_err, /* TX_TRUNCATE */ - zvol_replay_err, /* TX_SETATTR */ - zvol_replay_err, /* TX_ACL */ +zil_replay_func_t zvol_replay_vector[TX_MAX_TYPE] = { + (zil_replay_func_t)zvol_replay_err, /* no such transaction type */ + (zil_replay_func_t)zvol_replay_err, /* TX_CREATE */ + (zil_replay_func_t)zvol_replay_err, /* TX_MKDIR */ + (zil_replay_func_t)zvol_replay_err, /* TX_MKXATTR */ + (zil_replay_func_t)zvol_replay_err, /* TX_SYMLINK */ + (zil_replay_func_t)zvol_replay_err, /* TX_REMOVE */ + (zil_replay_func_t)zvol_replay_err, /* TX_RMDIR */ + (zil_replay_func_t)zvol_replay_err, /* TX_LINK */ + (zil_replay_func_t)zvol_replay_err, /* TX_RENAME */ + (zil_replay_func_t)zvol_replay_write, /* TX_WRITE */ + (zil_replay_func_t)zvol_replay_err, /* TX_TRUNCATE */ + (zil_replay_func_t)zvol_replay_err, /* TX_SETATTR */ + (zil_replay_func_t)zvol_replay_err, /* TX_ACL */ }; /* - * Create a minor node (plus a whole lot more) for the specified volume. + * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. + * + * We store data in the log buffers if it's small enough. + * Otherwise we will later flush the data out via dmu_sync(). 
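+ *
+ * Writes of a full, aligned block may be logged as WR_INDIRECT records
+ * (dmu_sync() later points the record at the on-disk block), synchronous
+ * writes are copied directly into the record (WR_COPIED), and everything
+ * else is logged as WR_NEED_COPY.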
*/ -int -zvol_create_minor(const char *name, major_t maj) -{ - zvol_state_t *zv; - objset_t *os; - dmu_object_info_t doi; - uint64_t volsize; - minor_t minor = 0; - struct pathname linkpath; - int ds_mode = DS_MODE_OWNER; - vnode_t *vp = NULL; - char *devpath; - size_t devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(name) + 1; - char chrbuf[30], blkbuf[30]; - int error; +ssize_t zvol_immediate_write_sz = 32768; - mutex_enter(&zvol_state_lock); - - if ((zv = zvol_minor_lookup(name)) != NULL) { - mutex_exit(&zvol_state_lock); - return (EEXIST); - } - - if (strchr(name, '@') != 0) - ds_mode |= DS_MODE_READONLY; - - error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os); - - if (error) { - mutex_exit(&zvol_state_lock); - return (error); - } - - error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); - - if (error) { - dmu_objset_close(os); - mutex_exit(&zvol_state_lock); - return (error); - } - - /* - * If there's an existing /dev/zvol symlink, try to use the - * same minor number we used last time. - */ - devpath = kmem_alloc(devpathlen, KM_SLEEP); - - (void) sprintf(devpath, "%s%s", ZVOL_FULL_DEV_DIR, name); +static void +zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, + uint64_t offset, uint64_t size, int sync) +{ + uint32_t blocksize = zv->zv_volblocksize; + zilog_t *zilog = zv->zv_zilog; + boolean_t slogging; + ssize_t immediate_write_sz; - error = lookupname(devpath, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp); + if (zil_replaying(zilog, tx)) + return; - kmem_free(devpath, devpathlen); + immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) + ? 0 : zvol_immediate_write_sz; + slogging = spa_has_slogs(zilog->zl_spa) && + (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY); - if (error == 0 && vp->v_type != VLNK) - error = EINVAL; + while (size) { + itx_t *itx; + lr_write_t *lr; + ssize_t len; + itx_wr_state_t write_state; - if (error == 0) { - pn_alloc(&linkpath); - error = pn_getsymlink(vp, &linkpath, kcred); - if (error == 0) { - char *ms = strstr(linkpath.pn_path, ZVOL_PSEUDO_DEV); - if (ms != NULL) { - ms += strlen(ZVOL_PSEUDO_DEV); - minor = stoi(&ms); - } + /* + * Unlike zfs_log_write() we can be called with + * up to DMU_MAX_ACCESS/2 (5MB) writes. + */ + if (blocksize > immediate_write_sz && !slogging && + size >= blocksize && offset % blocksize == 0) { + write_state = WR_INDIRECT; /* uses dmu_sync */ + len = blocksize; + } else if (sync) { + write_state = WR_COPIED; + len = MIN(ZIL_MAX_LOG_DATA, size); + } else { + write_state = WR_NEED_COPY; + len = MIN(ZIL_MAX_LOG_DATA, size); } - pn_free(&linkpath); - } - if (vp != NULL) - VN_RELE(vp); - - /* - * If we found a minor but it's already in use, we must pick a new one. - */ - if (minor != 0 && ddi_get_soft_state(zvol_state, minor) != NULL) - minor = 0; - - if (minor == 0) - minor = zvol_minor_alloc(); - - if (minor == 0) { - dmu_objset_close(os); - mutex_exit(&zvol_state_lock); - return (ENXIO); - } - - if (ddi_soft_state_zalloc(zvol_state, minor) != DDI_SUCCESS) { - dmu_objset_close(os); - mutex_exit(&zvol_state_lock); - return (EAGAIN); - } - - (void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME, - (char *)name); + itx = zil_itx_create(TX_WRITE, sizeof (*lr) + + (write_state == WR_COPIED ? 
len : 0)); + lr = (lr_write_t *)&itx->itx_lr; + if (write_state == WR_COPIED && dmu_read(zv->zv_objset, + ZVOL_OBJ, offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) { + zil_itx_destroy(itx); + itx = zil_itx_create(TX_WRITE, sizeof (*lr)); + lr = (lr_write_t *)&itx->itx_lr; + write_state = WR_NEED_COPY; + } - (void) sprintf(chrbuf, "%uc,raw", minor); + itx->itx_wr_state = write_state; + if (write_state == WR_NEED_COPY) + itx->itx_sod += len; + lr->lr_foid = ZVOL_OBJ; + lr->lr_offset = offset; + lr->lr_length = len; + lr->lr_blkoff = 0; + BP_ZERO(&lr->lr_blkptr); - if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR, - minor, DDI_PSEUDO, 0) == DDI_FAILURE) { - ddi_soft_state_free(zvol_state, minor); - dmu_objset_close(os); - mutex_exit(&zvol_state_lock); - return (EAGAIN); - } + itx->itx_private = zv; + itx->itx_sync = sync; - (void) sprintf(blkbuf, "%uc", minor); + (void) zil_itx_assign(zilog, itx, tx); - if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK, - minor, DDI_PSEUDO, 0) == DDI_FAILURE) { - ddi_remove_minor_node(zfs_dip, chrbuf); - ddi_soft_state_free(zvol_state, minor); - dmu_objset_close(os); - mutex_exit(&zvol_state_lock); - return (EAGAIN); + offset += len; + size -= len; } - - zv = ddi_get_soft_state(zvol_state, minor); - - (void) strcpy(zv->zv_name, name); - zv->zv_min_bs = DEV_BSHIFT; - zv->zv_minor = minor; - zv->zv_volsize = volsize; - zv->zv_objset = os; - zv->zv_mode = ds_mode; - zv->zv_zilog = zil_open(os, zvol_get_data); - mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL); - avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare, - sizeof (rl_t), offsetof(rl_t, r_node)); - list_create(&zv->zv_extents, sizeof (zvol_extent_t), - offsetof(zvol_extent_t, ze_node)); - /* get and cache the blocksize */ - error = dmu_object_info(os, ZVOL_OBJ, &doi); - ASSERT(error == 0); - zv->zv_volblocksize = doi.doi_data_block_size; - - zil_replay(os, zv, zvol_replay_vector); - zvol_size_changed(zv, maj); - - /* XXX this should handle the possible i/o error */ - VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset), - "readonly", zvol_readonly_changed_cb, zv) == 0); - - zvol_minors++; - - mutex_exit(&zvol_state_lock); - - return (0); } /* - * Remove minor node for the specified volume. + * Common write path running under the zvol taskq context. This function + * is responsible for copying the request structure data in to the DMU and + * signaling the request queue with the result of the copy. 
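+ * Requests flagged VDEV_REQ_FLUSH or VDEV_REQ_FUA additionally commit
+ * the ZIL, before and after the write respectively.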
*/ -int -zvol_remove_minor(const char *name) +static void +zvol_write(void *arg) { - zvol_state_t *zv; - char namebuf[30]; - - mutex_enter(&zvol_state_lock); - - if ((zv = zvol_minor_lookup(name)) == NULL) { - mutex_exit(&zvol_state_lock); - return (ENXIO); - } - - if (zv->zv_total_opens != 0) { - mutex_exit(&zvol_state_lock); - return (EBUSY); - } - - (void) sprintf(namebuf, "%uc,raw", zv->zv_minor); - ddi_remove_minor_node(zfs_dip, namebuf); - - (void) sprintf(namebuf, "%uc", zv->zv_minor); - ddi_remove_minor_node(zfs_dip, namebuf); - - VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset), - "readonly", zvol_readonly_changed_cb, zv) == 0); - - zil_close(zv->zv_zilog); - zv->zv_zilog = NULL; - dmu_objset_close(zv->zv_objset); - zv->zv_objset = NULL; - avl_destroy(&zv->zv_znode.z_range_avl); - mutex_destroy(&zv->zv_znode.z_range_lock); - - ddi_soft_state_free(zvol_state, zv->zv_minor); + struct request *req = (struct request *)arg; + struct request_queue *q = req->q; + zvol_state_t *zv = q->queuedata; + uint64_t offset = blk_rq_pos(req) << 9; + uint64_t size = blk_rq_bytes(req); + int error = 0; + dmu_tx_t *tx; + rl_t *rl; - zvol_minors--; + /* + * Annotate this call path with a flag that indicates that it is + * unsafe to use KM_SLEEP during memory allocations due to the + * potential for a deadlock. KM_PUSHPAGE should be used instead. + */ + ASSERT(!(current->flags & PF_NOFS)); + current->flags |= PF_NOFS; - mutex_exit(&zvol_state_lock); + if (req->cmd_flags & VDEV_REQ_FLUSH) + zil_commit(zv->zv_zilog, ZVOL_OBJ); - return (0); -} - -int -zvol_prealloc(zvol_state_t *zv) -{ - objset_t *os = zv->zv_objset; - dmu_tx_t *tx; - uint64_t refd, avail, usedobjs, availobjs; - uint64_t resid = zv->zv_volsize; - uint64_t off = 0; - - /* Check the space usage before attempting to allocate the space */ - dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs); - if (avail < zv->zv_volsize) - return (ENOSPC); - - /* Free old extents if they exist */ - zvol_free_extents(zv); - - while (resid != 0) { - int error; - uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE); - - tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - (void) dmu_free_long_range(os, ZVOL_OBJ, 0, off); - return (error); - } - dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx); - dmu_tx_commit(tx); - off += bytes; - resid -= bytes; + /* + * Some requests are just for flush and nothing else. 
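+	 * The ZIL commit above has already done the work for them, so
+	 * they can be completed immediately.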
+ */ + if (size == 0) { + blk_end_request(req, 0, size); + goto out; } - txg_wait_synced(dmu_objset_pool(os), 0); - - return (0); -} -int -zvol_update_volsize(zvol_state_t *zv, major_t maj, uint64_t volsize) -{ - dmu_tx_t *tx; - int error; - - ASSERT(MUTEX_HELD(&zvol_state_lock)); + rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER); tx = dmu_tx_create(zv->zv_objset); - dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); + dmu_tx_hold_write(tx, ZVOL_OBJ, offset, size); + + /* This will only fail for ENOSPC */ error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); - return (error); + zfs_range_unlock(rl); + blk_end_request(req, -error, size); + goto out; } - error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1, - &volsize, tx); + error = dmu_write_req(zv->zv_objset, ZVOL_OBJ, req, tx); + if (error == 0) + zvol_log_write(zv, tx, offset, size, + req->cmd_flags & VDEV_REQ_FUA); + dmu_tx_commit(tx); + zfs_range_unlock(rl); - if (error == 0) - error = dmu_free_long_range(zv->zv_objset, - ZVOL_OBJ, volsize, DMU_OBJECT_END); + if ((req->cmd_flags & VDEV_REQ_FUA) || + zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zv->zv_zilog, ZVOL_OBJ); - /* - * If we are using a faked-up state (zv_minor == 0) then don't - * try to update the in-core zvol state. - */ - if (error == 0 && zv->zv_minor) { - zv->zv_volsize = volsize; - zvol_size_changed(zv, maj); - } - return (error); + blk_end_request(req, -error, size); +out: + current->flags &= ~PF_NOFS; } -int -zvol_set_volsize(const char *name, major_t maj, uint64_t volsize) +#ifdef HAVE_BLK_QUEUE_DISCARD +static void +zvol_discard(void *arg) { - zvol_state_t *zv; + struct request *req = (struct request *)arg; + struct request_queue *q = req->q; + zvol_state_t *zv = q->queuedata; + uint64_t start = blk_rq_pos(req) << 9; + uint64_t end = start + blk_rq_bytes(req); int error; - dmu_object_info_t doi; - uint64_t old_volsize = 0ULL; - zvol_state_t state = { 0 }; - - mutex_enter(&zvol_state_lock); - - if ((zv = zvol_minor_lookup(name)) == NULL) { - /* - * If we are doing a "zfs clone -o volsize=", then the - * minor node won't exist yet. - */ - error = dmu_objset_open(name, DMU_OST_ZVOL, DS_MODE_OWNER, - &state.zv_objset); - if (error != 0) - goto out; - zv = &state; - } - old_volsize = zv->zv_volsize; + rl_t *rl; - if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 || - (error = zvol_check_volsize(volsize, - doi.doi_data_block_size)) != 0) - goto out; + /* + * Annotate this call path with a flag that indicates that it is + * unsafe to use KM_SLEEP during memory allocations due to the + * potential for a deadlock. KM_PUSHPAGE should be used instead. + */ + ASSERT(!(current->flags & PF_NOFS)); + current->flags |= PF_NOFS; - if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) { - error = EROFS; + if (end > zv->zv_volsize) { + blk_end_request(req, -EIO, blk_rq_bytes(req)); goto out; } - error = zvol_update_volsize(zv, maj, volsize); - /* - * Reinitialize the dump area to the new size. If we - * failed to resize the dump area then restore the it back to - * it's original size. + * Align the request to volume block boundaries. If we don't, + * then this will force dnode_free_range() to zero out the + * unaligned parts, which is slow (read-modify-write) and + * useless since we are not freeing any space by doing so. 
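+	 * For example, with an 8K volume block size a discard of the
+	 * byte range [4K, 20K) is trimmed to the single full block at
+	 * [8K, 16K).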
*/ - if (error == 0 && zv->zv_flags & ZVOL_DUMPIFIED) { - if ((error = zvol_dumpify(zv)) != 0 || - (error = dumpvp_resize()) != 0) { - (void) zvol_update_volsize(zv, maj, old_volsize); - error = zvol_dumpify(zv); - } - } + start = P2ROUNDUP(start, zv->zv_volblocksize); + end = P2ALIGN(end, zv->zv_volblocksize); -out: - if (state.zv_objset) - dmu_objset_close(state.zv_objset); - - mutex_exit(&zvol_state_lock); + if (start >= end) { + blk_end_request(req, 0, blk_rq_bytes(req)); + goto out; + } - return (error); -} + rl = zfs_range_lock(&zv->zv_znode, start, end - start, RL_WRITER); -int -zvol_set_volblocksize(const char *name, uint64_t volblocksize) -{ - zvol_state_t *zv; - dmu_tx_t *tx; - int error; - boolean_t needlock; + error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, end - start); /* - * The lock may already be held if we are being called from - * zvol_dump_init(). + * TODO: maybe we should add the operation to the log. */ - needlock = !MUTEX_HELD(&zvol_state_lock); - if (needlock) - mutex_enter(&zvol_state_lock); - - if ((zv = zvol_minor_lookup(name)) == NULL) { - if (needlock) - mutex_exit(&zvol_state_lock); - return (ENXIO); - } - if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) { - if (needlock) - mutex_exit(&zvol_state_lock); - return (EROFS); - } - - tx = dmu_tx_create(zv->zv_objset); - dmu_tx_hold_bonus(tx, ZVOL_OBJ); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ, - volblocksize, 0, tx); - if (error == ENOTSUP) - error = EBUSY; - dmu_tx_commit(tx); - if (error == 0) - zv->zv_volblocksize = volblocksize; - } - if (needlock) - mutex_exit(&zvol_state_lock); + zfs_range_unlock(rl); - return (error); + blk_end_request(req, -error, blk_rq_bytes(req)); +out: + current->flags &= ~PF_NOFS; } +#endif /* HAVE_BLK_QUEUE_DISCARD */ -/*ARGSUSED*/ -int -zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr) +/* + * Common read path running under the zvol taskq context. This function + * is responsible for copying the requested data out of the DMU and in to + * a linux request structure. It then must signal the request queue with + * an error code describing the result of the copy. 
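+ *
+ * Reads need only the range lock, not a transaction; checksum errors
+ * (ECKSUM) are converted to EIO before the request is completed.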
+ */
+static void
+zvol_read(void *arg)
 {
-	minor_t minor = getminor(*devp);
-	zvol_state_t *zv;
-
-	if (minor == 0)			/* This is the control device */
-		return (0);
-
-	mutex_enter(&zvol_state_lock);
+	struct request *req = (struct request *)arg;
+	struct request_queue *q = req->q;
+	zvol_state_t *zv = q->queuedata;
+	uint64_t offset = blk_rq_pos(req) << 9;
+	uint64_t size = blk_rq_bytes(req);
+	int error;
+	rl_t *rl;
 
-	zv = ddi_get_soft_state(zvol_state, minor);
-	if (zv == NULL) {
-		mutex_exit(&zvol_state_lock);
-		return (ENXIO);
+	if (size == 0) {
+		blk_end_request(req, 0, size);
+		return;
 	}
 
-	ASSERT(zv->zv_objset != NULL);
+	rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
 
-	if ((flag & FWRITE) &&
-	    (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY))) {
-		mutex_exit(&zvol_state_lock);
-		return (EROFS);
-	}
-	if (zv->zv_flags & ZVOL_EXCL) {
-		mutex_exit(&zvol_state_lock);
-		return (EBUSY);
-	}
-	if (flag & FEXCL) {
-		if (zv->zv_total_opens != 0) {
-			mutex_exit(&zvol_state_lock);
-			return (EBUSY);
-		}
-		zv->zv_flags |= ZVOL_EXCL;
-	}
+	error = dmu_read_req(zv->zv_objset, ZVOL_OBJ, req);
 
-	if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
-		zv->zv_open_count[otyp]++;
-		zv->zv_total_opens++;
-	}
+	zfs_range_unlock(rl);
 
-	mutex_exit(&zvol_state_lock);
+	/* convert checksum errors into IO errors */
+	if (error == ECKSUM)
+		error = EIO;
 
-	return (0);
+	blk_end_request(req, -error, size);
 }
 
-/*ARGSUSED*/
-int
-zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
+/*
+ * Request will be added back to the request queue and retried if
+ * it cannot be immediately dispatched to the taskq for handling.
+ */
+static inline void
+zvol_dispatch(task_func_t func, struct request *req)
 {
-	minor_t minor = getminor(dev);
-	zvol_state_t *zv;
-
-	if (minor == 0)		/* This is the control device */
-		return (0);
-
-	mutex_enter(&zvol_state_lock);
-
-	zv = ddi_get_soft_state(zvol_state, minor);
-	if (zv == NULL) {
-		mutex_exit(&zvol_state_lock);
-		return (ENXIO);
-	}
+	if (!taskq_dispatch(zvol_taskq, func, (void *)req, TQ_NOSLEEP))
+		blk_requeue_request(req->q, req);
+}
 
-	if (zv->zv_flags & ZVOL_EXCL) {
-		ASSERT(zv->zv_total_opens == 1);
-		zv->zv_flags &= ~ZVOL_EXCL;
-	}
+/*
+ * Common request path. Rather than registering a custom make_request()
+ * function we use the generic Linux version. This is done because it allows
+ * us to easily merge read requests which would otherwise be performed
+ * synchronously by the DMU. This is less critical in the write case where
+ * the DMU will perform the correct merging within a transaction group.
+ * Using the generic make_request() also lets us leverage the fact that
+ * the elevator will ensure correct ordering with regard to barrier IOs.
+ * On the downside it means that in the write case we end up doing request
+ * merging twice: once in the elevator and once in the DMU.
+ *
+ * The request handler is called under a spin lock so all the real work
+ * is handed off to be done in the context of the zvol taskq. This function
+ * simply performs basic request sanity checking and hands off the request.
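+ *
+ * Requests which fall outside the volume, or which are not filesystem
+ * requests, are failed here with EIO; writes to a read-only volume are
+ * failed with EROFS.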
+ */ +static void +zvol_request(struct request_queue *q) +{ + zvol_state_t *zv = q->queuedata; + struct request *req; + unsigned int size; + + while ((req = blk_fetch_request(q)) != NULL) { + size = blk_rq_bytes(req); + + if (size != 0 && blk_rq_pos(req) + blk_rq_sectors(req) > + get_capacity(zv->zv_disk)) { + printk(KERN_INFO + "%s: bad access: block=%llu, count=%lu\n", + req->rq_disk->disk_name, + (long long unsigned)blk_rq_pos(req), + (long unsigned)blk_rq_sectors(req)); + __blk_end_request(req, -EIO, size); + continue; + } - /* - * If the open count is zero, this is a spurious close. - * That indicates a bug in the kernel / DDI framework. - */ - ASSERT(zv->zv_open_count[otyp] != 0); - ASSERT(zv->zv_total_opens != 0); + if (!blk_fs_request(req)) { + printk(KERN_INFO "%s: non-fs cmd\n", + req->rq_disk->disk_name); + __blk_end_request(req, -EIO, size); + continue; + } - /* - * You may get multiple opens, but only one close. - */ - zv->zv_open_count[otyp]--; - zv->zv_total_opens--; + switch (rq_data_dir(req)) { + case READ: + zvol_dispatch(zvol_read, req); + break; + case WRITE: + if (unlikely(get_disk_ro(zv->zv_disk)) || + unlikely(zv->zv_flags & ZVOL_RDONLY)) { + __blk_end_request(req, -EROFS, size); + break; + } - mutex_exit(&zvol_state_lock); +#ifdef HAVE_BLK_QUEUE_DISCARD + if (req->cmd_flags & VDEV_REQ_DISCARD) { + zvol_dispatch(zvol_discard, req); + break; + } +#endif /* HAVE_BLK_QUEUE_DISCARD */ - return (0); + zvol_dispatch(zvol_write, req); + break; + default: + printk(KERN_INFO "%s: unknown cmd: %d\n", + req->rq_disk->disk_name, (int)rq_data_dir(req)); + __blk_end_request(req, -EIO, size); + break; + } + } } static void -zvol_get_done(dmu_buf_t *db, void *vzgd) +zvol_get_done(zgd_t *zgd, int error) { - zgd_t *zgd = (zgd_t *)vzgd; - rl_t *rl = zgd->zgd_rl; + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); + + zfs_range_unlock(zgd->zgd_rl); + + if (error == 0 && zgd->zgd_bp) + zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); - dmu_buf_rele(db, vzgd); - zfs_range_unlock(rl); - zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); kmem_free(zgd, sizeof (zgd_t)); } @@ -906,15 +808,18 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) { zvol_state_t *zv = arg; objset_t *os = zv->zv_objset; + uint64_t offset = lr->lr_offset; + uint64_t size = lr->lr_length; dmu_buf_t *db; - rl_t *rl; zgd_t *zgd; - uint64_t boff; /* block starting offset */ - int dlen = lr->lr_length; /* length of user data */ int error; - ASSERT(zio); - ASSERT(dlen != 0); + ASSERT(zio != NULL); + ASSERT(size != 0); + + zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_PUSHPAGE); + zgd->zgd_zilog = zv->zv_zilog; + zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER); /* * Write records come in two flavors: immediate and indirect. @@ -923,808 +828,796 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. 
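+ * For the indirect case the dbuf and the range lock are held until
+ * dmu_sync() completes, at which point zvol_get_done() releases them.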
*/ - if (buf != NULL) /* immediate write */ - return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf)); + if (buf != NULL) { /* immediate write */ + error = dmu_read(os, ZVOL_OBJ, offset, size, buf, + DMU_READ_NO_PREFETCH); + } else { + size = zv->zv_volblocksize; + offset = P2ALIGN_TYPED(offset, size, uint64_t); + error = dmu_buf_hold(os, ZVOL_OBJ, offset, zgd, &db, + DMU_READ_NO_PREFETCH); + if (error == 0) { + zgd->zgd_db = db; + zgd->zgd_bp = &lr->lr_blkptr; - zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); - zgd->zgd_zilog = zv->zv_zilog; - zgd->zgd_bp = &lr->lr_blkptr; + ASSERT(db != NULL); + ASSERT(db->db_offset == offset); + ASSERT(db->db_size == size); + + error = dmu_sync(zio, lr->lr_common.lrc_txg, + zvol_get_done, zgd); + + if (error == 0) + return (0); + } + } + + zvol_get_done(zgd, error); - /* - * Lock the range of the block to ensure that when the data is - * written out and its checksum is being calculated that no other - * thread can change the block. - */ - boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t); - rl = zfs_range_lock(&zv->zv_znode, boff, zv->zv_volblocksize, - RL_READER); - zgd->zgd_rl = rl; - - VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db)); - error = dmu_sync(zio, db, &lr->lr_blkptr, - lr->lr_common.lrc_txg, zvol_get_done, zgd); - if (error == 0) - zil_add_block(zv->zv_zilog, &lr->lr_blkptr); - /* - * If we get EINPROGRESS, then we need to wait for a - * write IO initiated by dmu_sync() to complete before - * we can release this dbuf. We will finish everything - * up in the zvol_get_done() callback. - */ - if (error == EINPROGRESS) - return (0); - dmu_buf_rele(db, zgd); - zfs_range_unlock(rl); - kmem_free(zgd, sizeof (zgd_t)); return (error); } /* - * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. - * - * We store data in the log buffers if it's small enough. - * Otherwise we will later flush the data out via dmu_sync(). + * The zvol_state_t's are inserted in increasing MINOR(dev_t) order. */ -ssize_t zvol_immediate_write_sz = 32768; - static void -zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len) +zvol_insert(zvol_state_t *zv_insert) { - uint32_t blocksize = zv->zv_volblocksize; - zilog_t *zilog = zv->zv_zilog; - lr_write_t *lr; + zvol_state_t *zv = NULL; - if (zilog->zl_replay) { - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); - zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = - zilog->zl_replaying_seq; - return; + ASSERT(MUTEX_HELD(&zvol_state_lock)); + ASSERT3U(MINOR(zv_insert->zv_dev) & ZVOL_MINOR_MASK, ==, 0); + for (zv = list_head(&zvol_state_list); zv != NULL; + zv = list_next(&zvol_state_list, zv)) { + if (MINOR(zv->zv_dev) > MINOR(zv_insert->zv_dev)) + break; } - while (len) { - ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize)); - itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr)); - - itx->itx_wr_state = - len > zvol_immediate_write_sz ? WR_INDIRECT : WR_NEED_COPY; - itx->itx_private = zv; - lr = (lr_write_t *)&itx->itx_lr; - lr->lr_foid = ZVOL_OBJ; - lr->lr_offset = off; - lr->lr_length = nbytes; - lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t); - BP_ZERO(&lr->lr_blkptr); + list_insert_before(&zvol_state_list, zv, zv_insert); +} - (void) zil_itx_assign(zilog, itx, tx); - len -= nbytes; - off += nbytes; - } +/* + * Simply remove the zvol from to list of zvols. 
+ */ +static void +zvol_remove(zvol_state_t *zv_remove) +{ + ASSERT(MUTEX_HELD(&zvol_state_lock)); + list_remove(&zvol_state_list, zv_remove); } static int -zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size, - boolean_t doread, boolean_t isdump) +zvol_first_open(zvol_state_t *zv) { - vdev_disk_t *dvd; - int c; - int numerrors = 0; - - for (c = 0; c < vd->vdev_children; c++) { - ASSERT(vd->vdev_ops == &vdev_mirror_ops); - int err = zvol_dumpio_vdev(vd->vdev_child[c], - addr, offset, size, doread, isdump); - if (err != 0) { - numerrors++; - } else if (doread) { - break; - } + objset_t *os; + uint64_t volsize; + int locked = 0; + int error; + uint64_t ro; + + /* + * In all other cases the spa_namespace_lock is taken before the + * bdev->bd_mutex lock. But in this case the Linux __blkdev_get() + * function calls fops->open() with the bdev->bd_mutex lock held. + * + * To avoid a potential lock inversion deadlock we preemptively + * try to take the spa_namespace_lock(). Normally it will not + * be contended and this is safe because spa_open_common() handles + * the case where the caller already holds the spa_namespace_lock. + * + * When it is contended we risk a lock inversion if we were to + * block waiting for the lock. Luckily, the __blkdev_get() + * function allows us to return -ERESTARTSYS which will result in + * bdev->bd_mutex being dropped, reacquired, and fops->open() being + * called again. This process can be repeated safely until both + * locks are acquired. + */ + if (!mutex_owned(&spa_namespace_lock)) { + locked = mutex_tryenter(&spa_namespace_lock); + if (!locked) + return (-ERESTARTSYS); } - if (!vd->vdev_ops->vdev_op_leaf) - return (numerrors < vd->vdev_children ? 0 : EIO); + /* lie and say we're read-only */ + error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zvol_tag, &os); + if (error) + goto out_mutex; - if (doread && !vdev_readable(vd)) - return (EIO); - else if (!doread && !vdev_writeable(vd)) - return (EIO); + error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); + if (error) { + dmu_objset_disown(os, zvol_tag); + goto out_mutex; + } - dvd = vd->vdev_tsd; - ASSERT3P(dvd, !=, NULL); - offset += VDEV_LABEL_START_SIZE; + zv->zv_objset = os; + error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf); + if (error) { + dmu_objset_disown(os, zvol_tag); + goto out_mutex; + } + + set_capacity(zv->zv_disk, volsize >> 9); + zv->zv_volsize = volsize; + zv->zv_zilog = zil_open(os, zvol_get_data); - if (ddi_in_panic() || isdump) { - ASSERT(!doread); - if (doread) - return (EIO); - return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset), - lbtodb(size))); + VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL) == 0); + if (ro || dmu_objset_is_snapshot(os) || + !spa_writeable(dmu_objset_spa(os))) { + set_disk_ro(zv->zv_disk, 1); + zv->zv_flags |= ZVOL_RDONLY; } else { - return (vdev_disk_physio(dvd->vd_lh, addr, size, offset, - doread ? 
B_READ : B_WRITE)); + set_disk_ro(zv->zv_disk, 0); + zv->zv_flags &= ~ZVOL_RDONLY; } + +out_mutex: + if (locked) + mutex_exit(&spa_namespace_lock); + + return (-error); +} + +static void +zvol_last_close(zvol_state_t *zv) +{ + zil_close(zv->zv_zilog); + zv->zv_zilog = NULL; + + dmu_buf_rele(zv->zv_dbuf, zvol_tag); + zv->zv_dbuf = NULL; + + /* + * Evict cached data + */ + if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) && + !(zv->zv_flags & ZVOL_RDONLY)) + txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); + (void) dmu_objset_evict_dbufs(zv->zv_objset); + + dmu_objset_disown(zv->zv_objset, zvol_tag); + zv->zv_objset = NULL; } static int -zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size, - boolean_t doread, boolean_t isdump) +zvol_open(struct block_device *bdev, fmode_t flag) { - vdev_t *vd; - int error; - zvol_extent_t *ze; - spa_t *spa = dmu_objset_spa(zv->zv_objset); + zvol_state_t *zv = bdev->bd_disk->private_data; + int error = 0, drop_mutex = 0; - /* Must be sector aligned, and not stradle a block boundary. */ - if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) || - P2BOUNDARY(offset, size, zv->zv_volblocksize)) { - return (EINVAL); + /* + * If the caller is already holding the mutex do not take it + * again, this will happen as part of zvol_create_minor(). + * Once add_disk() is called the device is live and the kernel + * will attempt to open it to read the partition information. + */ + if (!mutex_owned(&zvol_state_lock)) { + mutex_enter(&zvol_state_lock); + drop_mutex = 1; } - ASSERT(size <= zv->zv_volblocksize); - /* Locate the extent this belongs to */ - ze = list_head(&zv->zv_extents); - while (offset >= ze->ze_nblks * zv->zv_volblocksize) { - offset -= ze->ze_nblks * zv->zv_volblocksize; - ze = list_next(&zv->zv_extents, ze); + ASSERT3P(zv, !=, NULL); + + if (zv->zv_open_count == 0) { + error = zvol_first_open(zv); + if (error) + goto out_mutex; + } + + if ((flag & FMODE_WRITE) && + (get_disk_ro(zv->zv_disk) || (zv->zv_flags & ZVOL_RDONLY))) { + error = -EROFS; + goto out_open_count; } - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva)); - offset += DVA_GET_OFFSET(&ze->ze_dva); - error = zvol_dumpio_vdev(vd, addr, offset, size, doread, isdump); - spa_config_exit(spa, SCL_STATE, FTAG); + + zv->zv_open_count++; + +out_open_count: + if (zv->zv_open_count == 0) + zvol_last_close(zv); + +out_mutex: + if (drop_mutex) + mutex_exit(&zvol_state_lock); + + check_disk_change(bdev); + return (error); } -int -zvol_strategy(buf_t *bp) +#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID +static void +#else +static int +#endif +zvol_release(struct gendisk *disk, fmode_t mode) { - zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(bp->b_edev)); - uint64_t off, volsize; - size_t resid; - char *addr; - objset_t *os; - rl_t *rl; - int error = 0; - boolean_t doread = bp->b_flags & B_READ; - boolean_t is_dump = zv->zv_flags & ZVOL_DUMPIFIED; + zvol_state_t *zv = disk->private_data; + int drop_mutex = 0; - if (zv == NULL) { - bioerror(bp, ENXIO); - biodone(bp); - return (0); + if (!mutex_owned(&zvol_state_lock)) { + mutex_enter(&zvol_state_lock); + drop_mutex = 1; } - if (getminor(bp->b_edev) == 0) { - bioerror(bp, EINVAL); - biodone(bp); - return (0); - } + ASSERT3P(zv, !=, NULL); + ASSERT3U(zv->zv_open_count, >, 0); + zv->zv_open_count--; + if (zv->zv_open_count == 0) + zvol_last_close(zv); - if (!(bp->b_flags & B_READ) && - (zv->zv_flags & ZVOL_RDONLY || - zv->zv_mode & DS_MODE_READONLY)) { - 
bioerror(bp, EROFS); - biodone(bp); - return (0); - } + if (drop_mutex) + mutex_exit(&zvol_state_lock); - off = ldbtob(bp->b_blkno); - volsize = zv->zv_volsize; +#ifndef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID + return (0); +#endif +} - os = zv->zv_objset; - ASSERT(os != NULL); +static int +zvol_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + zvol_state_t *zv = bdev->bd_disk->private_data; + int error = 0; - bp_mapin(bp); - addr = bp->b_un.b_addr; - resid = bp->b_bcount; + if (zv == NULL) + return (-ENXIO); - if (resid > 0 && (off < 0 || off >= volsize)) { - bioerror(bp, EIO); - biodone(bp); - return (0); - } + switch (cmd) { + case BLKFLSBUF: + zil_commit(zv->zv_zilog, ZVOL_OBJ); + break; + case BLKZNAME: + error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); + break; + + default: + error = -ENOTTY; + break; - /* - * There must be no buffer changes when doing a dmu_sync() because - * we can't change the data whilst calculating the checksum. - */ - rl = zfs_range_lock(&zv->zv_znode, off, resid, - doread ? RL_READER : RL_WRITER); - - while (resid != 0 && off < volsize) { - size_t size = MIN(resid, zvol_maxphys); - if (is_dump) { - size = MIN(size, P2END(off, zv->zv_volblocksize) - off); - error = zvol_dumpio(zv, addr, off, size, - doread, B_FALSE); - } else if (doread) { - error = dmu_read(os, ZVOL_OBJ, off, size, addr); - } else { - dmu_tx_t *tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZVOL_OBJ, off, size); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - dmu_write(os, ZVOL_OBJ, off, size, addr, tx); - zvol_log_write(zv, tx, off, size); - dmu_tx_commit(tx); - } - } - if (error) { - /* convert checksum errors into IO errors */ - if (error == ECKSUM) - error = EIO; - break; - } - off += size; - addr += size; - resid -= size; } - zfs_range_unlock(rl); - if ((bp->b_resid = resid) == bp->b_bcount) - bioerror(bp, off > volsize ? EINVAL : error); + return (error); +} - if (!(bp->b_flags & B_ASYNC) && !doread && !zil_disable && !is_dump) - zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); - biodone(bp); +#ifdef CONFIG_COMPAT +static int +zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long arg) +{ + return zvol_ioctl(bdev, mode, cmd, arg); +} +#else +#define zvol_compat_ioctl NULL +#endif - return (0); +static int zvol_media_changed(struct gendisk *disk) +{ + zvol_state_t *zv = disk->private_data; + + return zv->zv_changed; +} + +static int zvol_revalidate_disk(struct gendisk *disk) +{ + zvol_state_t *zv = disk->private_data; + + zv->zv_changed = 0; + set_capacity(zv->zv_disk, zv->zv_volsize >> 9); + + return 0; } /* - * Set the buffer count to the zvol maximum transfer. - * Using our own routine instead of the default minphys() - * means that for larger writes we write bigger buffers on X86 - * (128K instead of 56K) and flush the disk write cache less often - * (every zvol_maxphys - currently 1MB) instead of minphys (currently - * 56K on X86 and 128K on sparc). + * Provide a simple virtual geometry for legacy compatibility. For devices + * smaller than 1 MiB a small head and sector count is used to allow very + * tiny devices. For devices over 1 Mib a standard head and sector count + * is used to keep the cylinders count reasonable. 
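+ * (The 2048 sector check below is the 1 MiB boundary: 2048 sectors of
+ * 512 bytes each.)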
*/ -void -zvol_minphys(struct buf *bp) +static int +zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) { - if (bp->b_bcount > zvol_maxphys) - bp->b_bcount = zvol_maxphys; + zvol_state_t *zv = bdev->bd_disk->private_data; + sector_t sectors = get_capacity(zv->zv_disk); + + if (sectors > 2048) { + geo->heads = 16; + geo->sectors = 63; + } else { + geo->heads = 2; + geo->sectors = 4; + } + + geo->start = 0; + geo->cylinders = sectors / (geo->heads * geo->sectors); + + return 0; } -int -zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks) +static struct kobject * +zvol_probe(dev_t dev, int *part, void *arg) { - minor_t minor = getminor(dev); zvol_state_t *zv; - int error = 0; - uint64_t size; - uint64_t boff; - uint64_t resid; + struct kobject *kobj; - if (minor == 0) /* This is the control device */ - return (ENXIO); + mutex_enter(&zvol_state_lock); + zv = zvol_find_by_dev(dev); + kobj = zv ? get_disk(zv->zv_disk) : NULL; + mutex_exit(&zvol_state_lock); - zv = ddi_get_soft_state(zvol_state, minor); - if (zv == NULL) - return (ENXIO); + return kobj; +} - boff = ldbtob(blkno); - resid = ldbtob(nblocks); +#ifdef HAVE_BDEV_BLOCK_DEVICE_OPERATIONS +static struct block_device_operations zvol_ops = { + .open = zvol_open, + .release = zvol_release, + .ioctl = zvol_ioctl, + .compat_ioctl = zvol_compat_ioctl, + .media_changed = zvol_media_changed, + .revalidate_disk = zvol_revalidate_disk, + .getgeo = zvol_getgeo, + .owner = THIS_MODULE, +}; - VERIFY3U(boff + resid, <=, zv->zv_volsize); +#else /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */ - while (resid) { - size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff); - error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE); - if (error) - break; - boff += size; - addr += size; - resid -= size; - } +static int +zvol_open_by_inode(struct inode *inode, struct file *file) +{ + return zvol_open(inode->i_bdev, file->f_mode); +} + +static int +zvol_release_by_inode(struct inode *inode, struct file *file) +{ + return zvol_release(inode->i_bdev->bd_disk, file->f_mode); +} - return (error); +static int +zvol_ioctl_by_inode(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + if (file == NULL || inode == NULL) + return -EINVAL; + return zvol_ioctl(inode->i_bdev, file->f_mode, cmd, arg); } -/*ARGSUSED*/ -int -zvol_read(dev_t dev, uio_t *uio, cred_t *cr) +# ifdef CONFIG_COMPAT +static long +zvol_compat_ioctl_by_inode(struct file *file, + unsigned int cmd, unsigned long arg) +{ + if (file == NULL) + return -EINVAL; + return zvol_compat_ioctl(file->f_dentry->d_inode->i_bdev, + file->f_mode, cmd, arg); +} +# else +# define zvol_compat_ioctl_by_inode NULL +# endif + +static struct block_device_operations zvol_ops = { + .open = zvol_open_by_inode, + .release = zvol_release_by_inode, + .ioctl = zvol_ioctl_by_inode, + .compat_ioctl = zvol_compat_ioctl_by_inode, + .media_changed = zvol_media_changed, + .revalidate_disk = zvol_revalidate_disk, + .getgeo = zvol_getgeo, + .owner = THIS_MODULE, +}; +#endif /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */ + +/* + * Allocate memory for a new zvol_state_t and setup the required + * request queue and generic disk structures for the block device. 
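+ * Each volume reserves a range of ZVOL_MINORS minor numbers, and the
+ * queue is explicitly switched to the "noop" elevator since the DMU
+ * performs its own IO scheduling and merging.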
+ */ +static zvol_state_t * +zvol_alloc(dev_t dev, const char *name) { - minor_t minor = getminor(dev); zvol_state_t *zv; - uint64_t volsize; - rl_t *rl; int error = 0; - if (minor == 0) /* This is the control device */ - return (ENXIO); + zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); - zv = ddi_get_soft_state(zvol_state, minor); - if (zv == NULL) - return (ENXIO); + spin_lock_init(&zv->zv_lock); + list_link_init(&zv->zv_next); - volsize = zv->zv_volsize; - if (uio->uio_resid > 0 && - (uio->uio_loffset < 0 || uio->uio_loffset >= volsize)) - return (EIO); + zv->zv_queue = blk_init_queue(zvol_request, &zv->zv_lock); + if (zv->zv_queue == NULL) + goto out_kmem; - if (zv->zv_flags & ZVOL_DUMPIFIED) { - error = physio(zvol_strategy, NULL, dev, B_READ, - zvol_minphys, uio); - return (error); +#ifdef HAVE_ELEVATOR_CHANGE + error = elevator_change(zv->zv_queue, "noop"); +#endif /* HAVE_ELEVATOR_CHANGE */ + if (error) { + printk("ZFS: Unable to set \"%s\" scheduler for zvol %s: %d\n", + "noop", name, error); + goto out_queue; } - rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, - RL_READER); - while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { - uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); +#ifdef HAVE_BLK_QUEUE_FLUSH + blk_queue_flush(zv->zv_queue, VDEV_REQ_FLUSH | VDEV_REQ_FUA); +#else + blk_queue_ordered(zv->zv_queue, QUEUE_ORDERED_DRAIN, NULL); +#endif /* HAVE_BLK_QUEUE_FLUSH */ - /* don't read past the end */ - if (bytes > volsize - uio->uio_loffset) - bytes = volsize - uio->uio_loffset; - - error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes); - if (error) { - /* convert checksum errors into IO errors */ - if (error == ECKSUM) - error = EIO; - break; - } - } - zfs_range_unlock(rl); - return (error); -} + zv->zv_disk = alloc_disk(ZVOL_MINORS); + if (zv->zv_disk == NULL) + goto out_queue; -/*ARGSUSED*/ -int -zvol_write(dev_t dev, uio_t *uio, cred_t *cr) -{ - minor_t minor = getminor(dev); - zvol_state_t *zv; - uint64_t volsize; - rl_t *rl; - int error = 0; + zv->zv_queue->queuedata = zv; + zv->zv_dev = dev; + zv->zv_open_count = 0; + strlcpy(zv->zv_name, name, MAXNAMELEN); - if (minor == 0) /* This is the control device */ - return (ENXIO); + mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare, + sizeof (rl_t), offsetof(rl_t, r_node)); + zv->zv_znode.z_is_zvol = TRUE; - zv = ddi_get_soft_state(zvol_state, minor); - if (zv == NULL) - return (ENXIO); + zv->zv_disk->major = zvol_major; + zv->zv_disk->first_minor = (dev & MINORMASK); + zv->zv_disk->fops = &zvol_ops; + zv->zv_disk->private_data = zv; + zv->zv_disk->queue = zv->zv_queue; + snprintf(zv->zv_disk->disk_name, DISK_NAME_LEN, "%s%d", + ZVOL_DEV_NAME, (dev & MINORMASK)); - volsize = zv->zv_volsize; - if (uio->uio_resid > 0 && - (uio->uio_loffset < 0 || uio->uio_loffset >= volsize)) - return (EIO); + return zv; - if (zv->zv_flags & ZVOL_DUMPIFIED) { - error = physio(zvol_strategy, NULL, dev, B_WRITE, - zvol_minphys, uio); - return (error); - } +out_queue: + blk_cleanup_queue(zv->zv_queue); +out_kmem: + kmem_free(zv, sizeof (zvol_state_t)); - rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, - RL_WRITER); - while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { - uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); - uint64_t off = uio->uio_loffset; - dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + return NULL; +} - if (bytes > volsize - off) /* don't write past the end */ - bytes = volsize - 
+/*
+ * Clean up and free a zvol_state_t which was created by zvol_alloc().
+ */
+static void
+zvol_free(zvol_state_t *zv)
+{
+	avl_destroy(&zv->zv_znode.z_range_avl);
+	mutex_destroy(&zv->zv_znode.z_range_lock);

-		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
-		error = dmu_tx_assign(tx, TXG_WAIT);
-		if (error) {
-			dmu_tx_abort(tx);
-			break;
-		}
-		error = dmu_write_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes, tx);
-		if (error == 0)
-			zvol_log_write(zv, tx, off, bytes);
-		dmu_tx_commit(tx);
+	del_gendisk(zv->zv_disk);
+	blk_cleanup_queue(zv->zv_queue);
+	put_disk(zv->zv_disk);

-		if (error)
-			break;
-	}
-	zfs_range_unlock(rl);
-	return (error);
+	kmem_free(zv, sizeof (zvol_state_t));
 }

-int
-zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
+static int
+__zvol_snapdev_hidden(const char *name)
 {
-	struct uuid uuid = EFI_RESERVED;
-	efi_gpe_t gpe = { 0 };
-	uint32_t crc;
-	dk_efi_t efi;
-	int length;
-	char *ptr;
-
-	if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
-		return (EFAULT);
-	ptr = (char *)(uintptr_t)efi.dki_data_64;
-	length = efi.dki_length;
-	/*
-	 * Some clients may attempt to request a PMBR for the
-	 * zvol.  Currently this interface will return EINVAL to
-	 * such requests.  These requests could be supported by
-	 * adding a check for lba == 0 and consing up an appropriate
-	 * PMBR.
-	 */
-	if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
-		return (EINVAL);
-
-	gpe.efi_gpe_StartingLBA = LE_64(34ULL);
-	gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
-	UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
-
-	if (efi.dki_lba == 1) {
-		efi_gpt_t gpt = { 0 };
-
-		gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
-		gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
-		gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
-		gpt.efi_gpt_MyLBA = LE_64(1ULL);
-		gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
-		gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
-		gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
-		gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
-		gpt.efi_gpt_SizeOfPartitionEntry =
-		    LE_32(sizeof (efi_gpe_t));
-		CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
-		gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
-		CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
-		gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
-		if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
-		    flag))
-			return (EFAULT);
-		ptr += sizeof (gpt);
-		length -= sizeof (gpt);
-	}
-	if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
-	    length), flag))
-		return (EFAULT);
-	return (0);
+	uint64_t snapdev;
+	char *parent;
+	char *atp;
+	int error = 0;
+
+	parent = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+	(void) strlcpy(parent, name, MAXPATHLEN);
+
+	if ((atp = strrchr(parent, '@')) != NULL) {
+		*atp = '\0';
+		error = dsl_prop_get_integer(parent, "snapdev", &snapdev, NULL);
+		if ((error == 0) && (snapdev == ZFS_SNAPDEV_HIDDEN))
+			error = ENODEV;
+	}
+	kmem_free(parent, MAXPATHLEN);
+	return (error);
 }
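
+/*
+ * Example (illustrative names): for the snapshot "tank/vol@snap",
+ * __zvol_snapdev_hidden() truncates a scratch copy of the name to the
+ * parent dataset "tank/vol" and reads that dataset's snapdev property.
+ * ENODEV is returned when snapdev is hidden, which callers treat as
+ * "quietly skip creating a minor for this snapshot".
+ */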

-/*
- * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
- */
-/*ARGSUSED*/
-int
-zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
+static int
+__zvol_create_minor(const char *name, boolean_t ignore_snapdev)
 {
 	zvol_state_t *zv;
-	struct dk_cinfo dki;
-	struct dk_minfo dkm;
-	struct dk_callback *dkc;
+	objset_t *os;
+	dmu_object_info_t *doi;
+	uint64_t volsize;
+	unsigned minor = 0;
 	int error = 0;
-	rl_t *rl;

-	mutex_enter(&zvol_state_lock);
+	ASSERT(MUTEX_HELD(&zvol_state_lock));

-	zv = ddi_get_soft_state(zvol_state, getminor(dev));
+	zv = zvol_find_by_name(name);
+	if (zv) {
+		error = EEXIST;
+		goto out;
+	}

-	if (zv == NULL) {
-		mutex_exit(&zvol_state_lock);
-		return (ENXIO);
+	if (ignore_snapdev == B_FALSE) {
+		error = __zvol_snapdev_hidden(name);
+		if (error)
+			goto out;
 	}

-	switch (cmd) {
+	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

-	case DKIOCINFO:
-		bzero(&dki, sizeof (dki));
-		(void) strcpy(dki.dki_cname, "zvol");
-		(void) strcpy(dki.dki_dname, "zvol");
-		dki.dki_ctype = DKC_UNKNOWN;
-		dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
-		mutex_exit(&zvol_state_lock);
-		if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
-			error = EFAULT;
-		return (error);
+	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os);
+	if (error)
+		goto out_doi;

-	case DKIOCGMEDIAINFO:
-		bzero(&dkm, sizeof (dkm));
-		dkm.dki_lbsize = 1U << zv->zv_min_bs;
-		dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
-		dkm.dki_media_type = DK_UNKNOWN;
-		mutex_exit(&zvol_state_lock);
-		if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
-			error = EFAULT;
-		return (error);
+	error = dmu_object_info(os, ZVOL_OBJ, doi);
+	if (error)
+		goto out_dmu_objset_disown;

-	case DKIOCGETEFI:
-	{
-		uint64_t vs = zv->zv_volsize;
-		uint8_t bs = zv->zv_min_bs;
+	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+	if (error)
+		goto out_dmu_objset_disown;

-		mutex_exit(&zvol_state_lock);
-		error = zvol_getefi((void *)arg, flag, vs, bs);
-		return (error);
-	}
+	error = zvol_find_minor(&minor);
+	if (error)
+		goto out_dmu_objset_disown;

-	case DKIOCFLUSHWRITECACHE:
-		dkc = (struct dk_callback *)arg;
-		zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
-		if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
-			(*dkc->dkc_callback)(dkc->dkc_cookie, error);
-			error = 0;
-		}
-		break;
+	zv = zvol_alloc(MKDEV(zvol_major, minor), name);
+	if (zv == NULL) {
+		error = EAGAIN;
+		goto out_dmu_objset_disown;
+	}

-	case DKIOCGGEOM:
-	case DKIOCGVTOC:
-		/*
-		 * commands using these (like prtvtoc) expect ENOTSUP
-		 * since we're emulating an EFI label
-		 */
-		error = ENOTSUP;
-		break;
+	if (dmu_objset_is_snapshot(os))
+		zv->zv_flags |= ZVOL_RDONLY;

-	case DKIOCDUMPINIT:
-		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
-		    RL_WRITER);
-		error = zvol_dumpify(zv);
-		zfs_range_unlock(rl);
-		break;
+	zv->zv_volblocksize = doi->doi_data_block_size;
+	zv->zv_volsize = volsize;
+	zv->zv_objset = os;

-	case DKIOCDUMPFINI:
-		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
-		    RL_WRITER);
-		error = zvol_dump_fini(zv);
-		zfs_range_unlock(rl);
-		break;
+	set_capacity(zv->zv_disk, zv->zv_volsize >> 9);
+
+	blk_queue_max_hw_sectors(zv->zv_queue, UINT_MAX);
+	blk_queue_max_segments(zv->zv_queue, UINT16_MAX);
+	blk_queue_max_segment_size(zv->zv_queue, UINT_MAX);
+	blk_queue_physical_block_size(zv->zv_queue, zv->zv_volblocksize);
+	blk_queue_io_opt(zv->zv_queue, zv->zv_volblocksize);
+#ifdef HAVE_BLK_QUEUE_DISCARD
+	blk_queue_max_discard_sectors(zv->zv_queue,
+	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
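+	/*
+	 * The limit above is zvol_max_discard_blocks volume blocks
+	 * converted to bytes and then, via the >> 9, to the 512-byte
+	 * sectors blk_queue_max_discard_sectors() expects.  For
+	 * example, with the default of 16384 blocks and an 8K
+	 * volblocksize this caps a single discard request at 128MB.
+	 */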
+	blk_queue_discard_granularity(zv->zv_queue, zv->zv_volblocksize);
+	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zv->zv_queue);
+#endif
+#ifdef HAVE_BLK_QUEUE_NONROT
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zv->zv_queue);
+#endif

-	default:
-		error = ENOTTY;
-		break;
+	if (spa_writeable(dmu_objset_spa(os))) {
+		if (zil_replay_disable)
+			zil_destroy(dmu_objset_zil(os), B_FALSE);
+		else
+			zil_replay(os, zv, zvol_replay_vector);
+	}
+
+	zv->zv_objset = NULL;
+out_dmu_objset_disown:
+	dmu_objset_disown(os, zvol_tag);
+out_doi:
+	kmem_free(doi, sizeof (dmu_object_info_t));
+out:
+	if (error == 0) {
+		zvol_insert(zv);
+		add_disk(zv->zv_disk);
 	}
-	mutex_exit(&zvol_state_lock);
+
 	return (error);
 }

+/*
+ * Create a block device minor node and set up the linkage between it
+ * and the specified volume.  Once this function returns, the block
+ * device is live and ready for use.
+ */
 int
-zvol_busy(void)
+zvol_create_minor(const char *name)
 {
-	return (zvol_minors != 0);
-}
+	int error;

-void
-zvol_init(void)
-{
-	VERIFY(ddi_soft_state_init(&zvol_state, sizeof (zvol_state_t), 1) == 0);
-	mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_enter(&zvol_state_lock);
+	error = __zvol_create_minor(name, B_FALSE);
+	mutex_exit(&zvol_state_lock);
+
+	return (error);
 }

-void
-zvol_fini(void)
+static int
+__zvol_remove_minor(const char *name)
 {
-	mutex_destroy(&zvol_state_lock);
-	ddi_soft_state_fini(&zvol_state);
+	zvol_state_t *zv;
+
+	ASSERT(MUTEX_HELD(&zvol_state_lock));
+
+	zv = zvol_find_by_name(name);
+	if (zv == NULL)
+		return (ENXIO);
+
+	if (zv->zv_open_count > 0)
+		return (EBUSY);
+
+	zvol_remove(zv);
+	zvol_free(zv);
+
+	return (0);
 }

-static boolean_t
-zvol_is_swap(zvol_state_t *zv)
+/*
+ * Remove a block device minor node for the specified volume.
+ */
+int
+zvol_remove_minor(const char *name)
 {
-	vnode_t *vp;
-	boolean_t ret = B_FALSE;
-	char *devpath;
-	size_t devpathlen;
 	int error;

-	devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(zv->zv_name) + 1;
-	devpath = kmem_alloc(devpathlen, KM_SLEEP);
-	(void) sprintf(devpath, "%s%s", ZVOL_FULL_DEV_DIR, zv->zv_name);
-	error = lookupname(devpath, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
-	kmem_free(devpath, devpathlen);
+	mutex_enter(&zvol_state_lock);
+	error = __zvol_remove_minor(name);
+	mutex_exit(&zvol_state_lock);

-	ret = !error && IS_SWAPVP(common_specvp(vp));
+	return (error);
+}

-	if (vp != NULL)
-		VN_RELE(vp);
+static int
+zvol_create_minors_cb(spa_t *spa, uint64_t dsobj,
+    const char *dsname, void *arg)
+{
+	if (strchr(dsname, '/') == NULL)
+		return (0);

-	return (ret);
+	(void) __zvol_create_minor(dsname, B_FALSE);
+	return (0);
 }
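
+/*
+ * Any error from __zvol_create_minor() is deliberately dropped in the
+ * callback above: returning non-zero would abort the dataset walk in
+ * zvol_create_minors(), and one misbehaving volume should not prevent
+ * the remaining minors from being created.  Names without a '/' are
+ * pool root datasets, which can never be volumes, so they are skipped.
+ */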

-static int
-zvol_dump_init(zvol_state_t *zv, boolean_t resize)
+/*
+ * Create minors for the specified pool; if pool is NULL, create
+ * minors for all available pools.
+ */
+int
+zvol_create_minors(const char *pool)
 {
-	dmu_tx_t *tx;
+	spa_t *spa = NULL;
 	int error = 0;
-	objset_t *os = zv->zv_objset;
-	nvlist_t *nv = NULL;
-
-	ASSERT(MUTEX_HELD(&zvol_state_lock));
-	tx = dmu_tx_create(os);
-	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
-	error = dmu_tx_assign(tx, TXG_WAIT);
-	if (error) {
-		dmu_tx_abort(tx);
-		return (error);
-	}
+	if (zvol_inhibit_dev)
+		return (0);

-	/*
-	 * If we are resizing the dump device then we only need to
-	 * update the refreservation to match the newly updated
-	 * zvolsize. Otherwise, we save off the original state of the
-	 * zvol so that we can restore them if the zvol is ever undumpified.
-	 */
-	if (resize) {
-		error = zap_update(os, ZVOL_ZAP_OBJ,
-		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
-		    &zv->zv_volsize, tx);
+	mutex_enter(&zvol_state_lock);
+	if (pool) {
+		error = dmu_objset_find_spa(NULL, pool, zvol_create_minors_cb,
+		    NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
 	} else {
-		uint64_t checksum, compress, refresrv, vbs;
-
-		error = dsl_prop_get_integer(zv->zv_name,
-		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
-		error = error ? error : dsl_prop_get_integer(zv->zv_name,
-		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL);
-		error = error ? error : dsl_prop_get_integer(zv->zv_name,
-		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL);
-		error = error ? error : dsl_prop_get_integer(zv->zv_name,
-		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL);
-
-		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
-		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
-		    &compress, tx);
-		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
-		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx);
-		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
-		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
-		    &refresrv, tx);
-		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
-		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
-		    &vbs, tx);
+		mutex_enter(&spa_namespace_lock);
+		while ((spa = spa_next(spa)) != NULL) {
+			error = dmu_objset_find_spa(NULL,
+			    spa_name(spa), zvol_create_minors_cb, NULL,
+			    DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
+			if (error)
+				break;
+		}
+		mutex_exit(&spa_namespace_lock);
 	}
-	dmu_tx_commit(tx);
+	mutex_exit(&zvol_state_lock);

-	/* Truncate the file */
-	if (!error)
-		error = dmu_free_long_range(zv->zv_objset,
-		    ZVOL_OBJ, 0, DMU_OBJECT_END);
+	return (error);
+}
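
+/*
+ * A minimal usage sketch (hypothetical caller): after importing the
+ * pool "tank", something like
+ *
+ *	error = zvol_create_minors("tank");
+ *
+ * walks every child dataset and snapshot of "tank" through
+ * dmu_objset_find_spa() and registers a block device node for each
+ * volume found.  Note the lock order used above: zvol_state_lock is
+ * always taken before spa_namespace_lock.
+ */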

-	if (error)
-		return (error);
+/*
+ * Remove minors for the specified pool; if pool is NULL, remove all
+ * minors.
+ */
+void
+zvol_remove_minors(const char *pool)
+{
+	zvol_state_t *zv, *zv_next;
+	char *str;

-	/*
-	 * We only need update the zvol's property if we are initializing
-	 * the dump area for the first time.
-	 */
-	if (!resize) {
-		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-		VERIFY(nvlist_add_uint64(nv,
-		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
-		VERIFY(nvlist_add_uint64(nv,
-		    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
-		    ZIO_COMPRESS_OFF) == 0);
-		VERIFY(nvlist_add_uint64(nv,
-		    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
-		    ZIO_CHECKSUM_OFF) == 0);
-		VERIFY(nvlist_add_uint64(nv,
-		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
-		    SPA_MAXBLOCKSIZE) == 0);
-
-		error = zfs_set_prop_nvlist(zv->zv_name, nv);
-		nvlist_free(nv);
+	if (zvol_inhibit_dev)
+		return;

-		if (error)
-			return (error);
+	str = kmem_zalloc(MAXNAMELEN, KM_SLEEP);
+	if (pool) {
+		(void) strncpy(str, pool, strlen(pool));
+		(void) strcat(str, "/");
 	}

-	/* Allocate the space for the dump */
-	error = zvol_prealloc(zv);
-	return (error);
+	mutex_enter(&zvol_state_lock);
+	for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
+		zv_next = list_next(&zvol_state_list, zv);
+
+		if (pool == NULL || !strncmp(str, zv->zv_name, strlen(str))) {
+			zvol_remove(zv);
+			zvol_free(zv);
+		}
+	}
+	mutex_exit(&zvol_state_lock);
+	kmem_free(str, MAXNAMELEN);
 }

 static int
-zvol_dumpify(zvol_state_t *zv)
-{
-	int error = 0;
-	uint64_t dumpsize = 0;
-	dmu_tx_t *tx;
-	objset_t *os = zv->zv_objset;
+snapdev_snapshot_changed_cb(const char *dsname, void *arg)
+{
+	uint64_t snapdev = *(uint64_t *)arg;

-	if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY))
-		return (EROFS);
+	if (strchr(dsname, '@') == NULL)
+		return (0);

-	/*
-	 * We do not support swap devices acting as dump devices.
-	 */
-	if (zvol_is_swap(zv))
-		return (ENOTSUP);
+	switch (snapdev) {
+	case ZFS_SNAPDEV_VISIBLE:
+		mutex_enter(&zvol_state_lock);
+		(void) __zvol_create_minor(dsname, B_TRUE);
+		mutex_exit(&zvol_state_lock);
+		break;
+	case ZFS_SNAPDEV_HIDDEN:
+		(void) zvol_remove_minor(dsname);
+		break;
+	}
+	return (0);
+}

-	if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
-	    8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
-		boolean_t resize = (dumpsize > 0) ? B_TRUE : B_FALSE;
+int
+zvol_set_snapdev(const char *dsname, uint64_t snapdev)
+{
+	(void) dmu_objset_find((char *)dsname, snapdev_snapshot_changed_cb,
+	    &snapdev, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+	/* caller should continue to modify snapdev property */
+	return (-1);
+}
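
+/*
+ * The -1 above is not an error: it tells the property-set path that
+ * this handler only created or removed minors and that the caller
+ * must still go on to store the new snapdev value itself.  Flipping
+ * snapdev to visible therefore both persists the property and
+ * materializes a device node for every existing snapshot.
+ */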

-		if ((error = zvol_dump_init(zv, resize)) != 0) {
-			(void) zvol_dump_fini(zv);
-			return (error);
-		}
-	}
-	/*
-	 * Build up our lba mapping.
-	 */
-	error = zvol_get_lbas(zv);
-	if (error) {
-		(void) zvol_dump_fini(zv);
-		return (error);
-	}
+int
+zvol_init(void)
+{
+	int error;

-	tx = dmu_tx_create(os);
-	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
-	error = dmu_tx_assign(tx, TXG_WAIT);
-	if (error) {
-		dmu_tx_abort(tx);
-		(void) zvol_dump_fini(zv);
-		return (error);
-	}
+	list_create(&zvol_state_list, sizeof (zvol_state_t),
+	    offsetof(zvol_state_t, zv_next));
+	mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);

-	zv->zv_flags |= ZVOL_DUMPIFIED;
-	error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
-	    &zv->zv_volsize, tx);
-	dmu_tx_commit(tx);
+	zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_threads, maxclsyspri,
+	    zvol_threads, INT_MAX, TASKQ_PREPOPULATE);
+	if (zvol_taskq == NULL) {
+		printk(KERN_INFO "ZFS: taskq_create() failed\n");
+		error = -ENOMEM;
+		goto out1;
+	}

+	error = register_blkdev(zvol_major, ZVOL_DRIVER);
 	if (error) {
-		(void) zvol_dump_fini(zv);
-		return (error);
+		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
+		goto out2;
 	}

-	txg_wait_synced(dmu_objset_pool(os), 0);
+	blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS,
+	    THIS_MODULE, zvol_probe, NULL, NULL);

 	return (0);
+
+out2:
+	taskq_destroy(zvol_taskq);
+out1:
+	mutex_destroy(&zvol_state_lock);
+	list_destroy(&zvol_state_list);
+
+	return (error);
 }

-static int
-zvol_dump_fini(zvol_state_t *zv)
+void
+zvol_fini(void)
 {
-	dmu_tx_t *tx;
-	objset_t *os = zv->zv_objset;
-	nvlist_t *nv;
-	int error = 0;
-	uint64_t checksum, compress, refresrv, vbs;
+	zvol_remove_minors(NULL);
+	blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS);
+	unregister_blkdev(zvol_major, ZVOL_DRIVER);
+	taskq_destroy(zvol_taskq);
+	mutex_destroy(&zvol_state_lock);
+	list_destroy(&zvol_state_list);
+}
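
+/*
+ * The teardown in zvol_fini() mirrors, in reverse, both the setup in
+ * zvol_init() and its out2/out1 error unwind: minors first, then the
+ * block device region and major number, then the taskq, and finally
+ * the lock and list.  Keeping the two paths symmetric is what lets a
+ * failed zvol_init() return without leaking kernel resources.
+ */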

-	/*
-	 * Attempt to restore the zvol back to its pre-dumpified state.
-	 * This is a best-effort attempt as it's possible that not all
-	 * of these properties were initialized during the dumpify process
-	 * (i.e. error during zvol_dump_init).
-	 */
+module_param(zvol_inhibit_dev, uint, 0644);
+MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");

-	tx = dmu_tx_create(os);
-	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
-	error = dmu_tx_assign(tx, TXG_WAIT);
-	if (error) {
-		dmu_tx_abort(tx);
-		return (error);
-	}
-	(void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
-	dmu_tx_commit(tx);
+module_param(zvol_major, uint, 0444);
+MODULE_PARM_DESC(zvol_major, "Major number for zvol device");

-	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
-	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum);
-	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
-	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
-	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
-	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
-	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
-	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);
-
-	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-	(void) nvlist_add_uint64(nv,
-	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum);
-	(void) nvlist_add_uint64(nv,
-	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
-	(void) nvlist_add_uint64(nv,
-	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
-	(void) nvlist_add_uint64(nv,
-	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), vbs);
-	(void) zfs_set_prop_nvlist(zv->zv_name, nv);
-	nvlist_free(nv);
-
-	zvol_free_extents(zv);
-	zv->zv_flags &= ~ZVOL_DUMPIFIED;
-	(void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
+module_param(zvol_threads, uint, 0444);
+MODULE_PARM_DESC(zvol_threads, "Number of threads for zvol device");

-	return (0);
-}
+module_param(zvol_max_discard_blocks, ulong, 0444);
+MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard at once");
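
+/*
+ * All four parameters appear under /sys/module/zfs/parameters/.  Only
+ * zvol_inhibit_dev (mode 0644) may be changed at runtime; the 0444
+ * parameters must be set at module load time, e.g. (illustrative
+ * values):
+ *
+ *	modprobe zfs zvol_major=231 zvol_threads=16
+ */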