X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fzfs_vfsops.c;h=14411b8c4cfa95bce4501e95cb1f35a03f4437fd;hb=b516a07b997a8c3006788963f6b6128e505cf56c;hp=1bf1bc527b7c51d1bb80c311515a24eb932b0668;hpb=fb5f0bc83330c8a0236c4d34a23723ac1974971a;p=zfs.git diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c index 1bf1bc5..14411b8 100644 --- a/module/zfs/zfs_vfsops.c +++ b/module/zfs/zfs_vfsops.c @@ -19,10 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* Portions Copyright 2010 Robert Milkowski */ + #include #include #include @@ -46,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -60,65 +62,10 @@ #include #include #include +#include +#include "zfs_comutil.h" -int zfsfstype; -vfsops_t *zfs_vfsops = NULL; -static major_t zfs_major; -static minor_t zfs_minor; -static kmutex_t zfs_dev_mtx; - -static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr); -static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr); -static int zfs_mountroot(vfs_t *vfsp, enum whymountroot); -static int zfs_root(vfs_t *vfsp, vnode_t **vpp); -static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp); -static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp); -static void zfs_freevfs(vfs_t *vfsp); - -static const fs_operation_def_t zfs_vfsops_template[] = { - VFSNAME_MOUNT, { .vfs_mount = zfs_mount }, - VFSNAME_MOUNTROOT, { .vfs_mountroot = zfs_mountroot }, - VFSNAME_UNMOUNT, { .vfs_unmount = zfs_umount }, - VFSNAME_ROOT, { .vfs_root = zfs_root }, - VFSNAME_STATVFS, { .vfs_statvfs = zfs_statvfs }, - VFSNAME_SYNC, { .vfs_sync = zfs_sync }, - VFSNAME_VGET, { .vfs_vget = zfs_vget }, - VFSNAME_FREEVFS, { .vfs_freevfs = zfs_freevfs }, - NULL, NULL -}; - -static const fs_operation_def_t zfs_vfsops_eio_template[] = { - VFSNAME_FREEVFS, { .vfs_freevfs = zfs_freevfs }, - NULL, NULL -}; - -/* - * We need to keep a count of active fs's. - * This is necessary to prevent our module - * from being unloaded after a umount -f - */ -static uint32_t zfs_active_fs_count = 0; - -static char *noatime_cancel[] = { MNTOPT_ATIME, NULL }; -static char *atime_cancel[] = { MNTOPT_NOATIME, NULL }; -static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL }; -static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL }; - -/* - * MO_DEFAULT is not used since the default value is determined - * by the equivalent property. - */ -static mntopt_t mntopts[] = { - { MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL }, - { MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL }, - { MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL }, - { MNTOPT_ATIME, atime_cancel, NULL, 0, NULL } -}; - -static mntopts_t zfs_mntopts = { - sizeof (mntopts) / sizeof (mntopt_t), - mntopts -}; +#ifdef HAVE_ZPL /*ARGSUSED*/ int @@ -128,16 +75,7 @@ zfs_sync(vfs_t *vfsp, short flag, cred_t *cr) * Data integrity is job one. We don't want a compromised kernel * writing to the storage pool, so we never sync during panic. */ - if (panicstr) - return (0); - - /* - * SYNC_ATTR is used by fsflush() to force old filesystems like UFS - * to sync metadata, which they would otherwise cache indefinitely. - * Semantically, the only requirement is that the sync be initiated. - * The DMU syncs out txgs frequently, so there's nothing to do. 
- */ - if (flag & SYNC_ATTR) + if (unlikely(oops_in_progress)) return (0); if (vfsp != NULL) { @@ -145,12 +83,28 @@ zfs_sync(vfs_t *vfsp, short flag, cred_t *cr) * Sync a specific filesystem. */ zfsvfs_t *zfsvfs = vfsp->vfs_data; + dsl_pool_t *dp; ZFS_ENTER(zfsvfs); + dp = dmu_objset_pool(zfsvfs->z_os); + +#ifdef HAVE_SHUTDOWN + /* + * If the system is shutting down, then skip any + * filesystems which may exist on a suspended pool. + * + * XXX: This can be implemented using the Linux reboot + * notifiers: {un}register_reboot_notifier(). + */ + if (sys_shutdown && spa_suspended(dp->dp_spa)) { + ZFS_EXIT(zfsvfs); + return (0); + } +#endif /* HAVE_SHUTDOWN */ + if (zfsvfs->z_log != NULL) - zil_commit(zfsvfs->z_log, UINT64_MAX, 0); - else - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + zil_commit(zfsvfs->z_log, 0); + ZFS_EXIT(zfsvfs); } else { /* @@ -163,58 +117,7 @@ zfs_sync(vfs_t *vfsp, short flag, cred_t *cr) return (0); } - -static int -zfs_create_unique_device(dev_t *dev) -{ - major_t new_major; - - do { - ASSERT3U(zfs_minor, <=, MAXMIN32); - minor_t start = zfs_minor; - do { - mutex_enter(&zfs_dev_mtx); - if (zfs_minor >= MAXMIN32) { - /* - * If we're still using the real major - * keep out of /dev/zfs and /dev/zvol minor - * number space. If we're using a getudev()'ed - * major number, we can use all of its minors. - */ - if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) - zfs_minor = ZFS_MIN_MINOR; - else - zfs_minor = 0; - } else { - zfs_minor++; - } - *dev = makedevice(zfs_major, zfs_minor); - mutex_exit(&zfs_dev_mtx); - } while (vfs_devismounted(*dev) && zfs_minor != start); - if (zfs_minor == start) { - /* - * We are using all ~262,000 minor numbers for the - * current major number. Create a new major number. - */ - if ((new_major = getudev()) == (major_t)-1) { - cmn_err(CE_WARN, - "zfs_mount: Can't get unique major " - "device number."); - return (-1); - } - mutex_enter(&zfs_dev_mtx); - zfs_major = new_major; - zfs_minor = 0; - - mutex_exit(&zfs_dev_mtx); - } else { - break; - } - /* CONSTANTCONDITION */ - } while (1); - - return (0); -} +EXPORT_SYMBOL(zfs_sync); static void atime_changed_cb(void *arg, uint64_t newval) @@ -367,14 +270,6 @@ vscan_changed_cb(void *arg, uint64_t newval) } static void -acl_mode_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - zfsvfs->z_acl_mode = newval; -} - -static void acl_inherit_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; @@ -382,7 +277,7 @@ acl_inherit_changed_cb(void *arg, uint64_t newval) zfsvfs->z_acl_inherit = newval; } -static int +int zfs_register_callbacks(vfs_t *vfsp) { struct dsl_dataset *ds = NULL; @@ -408,7 +303,8 @@ zfs_register_callbacks(vfs_t *vfsp) * of mount options, we stash away the current values and * restore them after we register the callbacks. */ - if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || + !spa_writeable(dmu_objset_spa(os))) { readonly = B_TRUE; do_readonly = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { @@ -474,8 +370,8 @@ zfs_register_callbacks(vfs_t *vfsp) char osname[MAXNAMELEN]; dmu_objset_name(os, osname); - if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand, - NULL)) { + if ((error = dsl_prop_get_integer(osname, "nbmand", &nbmand, + NULL))) { return (error); } } @@ -504,8 +400,6 @@ zfs_register_callbacks(vfs_t *vfsp) error = error ? error : dsl_prop_register(ds, "snapdir", snapdir_changed_cb, zfsvfs); error = error ? 
error : dsl_prop_register(ds, - "aclmode", acl_mode_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "vscan", vscan_changed_cb, zfsvfs); @@ -546,13 +440,436 @@ unregister: (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs); return (error); } +EXPORT_SYMBOL(zfs_register_callbacks); +#endif /* HAVE_ZPL */ + +static int +zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, + uint64_t *userp, uint64_t *groupp) +{ + znode_phys_t *znp = data; + int error = 0; + + /* + * Is it a valid type of object to track? + */ + if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) + return (ENOENT); + + /* + * If we have a NULL data pointer + * then assume the id's aren't changing and + * return EEXIST to the dmu to let it know to + * use the same ids + */ + if (data == NULL) + return (EEXIST); + + if (bonustype == DMU_OT_ZNODE) { + *userp = znp->zp_uid; + *groupp = znp->zp_gid; + } else { + int hdrsize; + + ASSERT(bonustype == DMU_OT_SA); + hdrsize = sa_hdrsize(data); + + if (hdrsize != 0) { + *userp = *((uint64_t *)((uintptr_t)data + hdrsize + + SA_UID_OFFSET)); + *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + + SA_GID_OFFSET)); + } else { + /* + * This should only happen for newly created + * files that haven't had the znode data filled + * in yet. + */ + *userp = 0; + *groupp = 0; + } + } + return (error); +} + +#ifdef HAVE_ZPL +static void +fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, + char *domainbuf, int buflen, uid_t *ridp) +{ + uint64_t fuid; + const char *domain; + + fuid = strtonum(fuidstr, NULL); + + domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); + if (domain) + (void) strlcpy(domainbuf, domain, buflen); + else + domainbuf[0] = '\0'; + *ridp = FUID_RID(fuid); +} + +static uint64_t +zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) +{ + switch (type) { + case ZFS_PROP_USERUSED: + return (DMU_USERUSED_OBJECT); + case ZFS_PROP_GROUPUSED: + return (DMU_GROUPUSED_OBJECT); + case ZFS_PROP_USERQUOTA: + return (zfsvfs->z_userquota_obj); + case ZFS_PROP_GROUPQUOTA: + return (zfsvfs->z_groupquota_obj); + default: + return (ENOTSUP); + } + return (0); +} + +int +zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) +{ + int error; + zap_cursor_t zc; + zap_attribute_t za; + zfs_useracct_t *buf = vbuf; + uint64_t obj; + + if (!dmu_objset_userspace_present(zfsvfs->z_os)) + return (ENOTSUP); + + obj = zfs_userquota_prop_to_obj(zfsvfs, type); + if (obj == 0) { + *bufsizep = 0; + return (0); + } + + for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > + *bufsizep) + break; + + fuidstr_to_sid(zfsvfs, za.za_name, + buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); + + buf->zu_space = za.za_first_integer; + buf++; + } + if (error == ENOENT) + error = 0; + + ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); + *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; 
+ *cookiep = zap_cursor_serialize(&zc); + zap_cursor_fini(&zc); + return (error); +} +EXPORT_SYMBOL(zfs_userspace_many); + +/* + * buf must be big enough (eg, 32 bytes) + */ +static int +id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, + char *buf, boolean_t addok) +{ + uint64_t fuid; + int domainid = 0; + + if (domain && domain[0]) { + domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); + if (domainid == -1) + return (ENOENT); + } + fuid = FUID_ENCODE(domainid, rid); + (void) sprintf(buf, "%llx", (longlong_t)fuid); + return (0); +} + +int +zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t *valp) +{ + char buf[32]; + int err; + uint64_t obj; + + *valp = 0; + + if (!dmu_objset_userspace_present(zfsvfs->z_os)) + return (ENOTSUP); + + obj = zfs_userquota_prop_to_obj(zfsvfs, type); + if (obj == 0) + return (0); + + err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE); + if (err) + return (err); + + err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); + if (err == ENOENT) + err = 0; + return (err); +} +EXPORT_SYMBOL(zfs_userspace_one); + +int +zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t quota) +{ + char buf[32]; + int err; + dmu_tx_t *tx; + uint64_t *objp; + boolean_t fuid_dirtied; + + if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA) + return (EINVAL); + + if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) + return (ENOTSUP); + + objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj : + &zfsvfs->z_groupquota_obj; + + err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE); + if (err) + return (err); + fuid_dirtied = zfsvfs->z_fuid_dirty; + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); + if (*objp == 0) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + zfs_userquota_prop_prefixes[type]); + } + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + mutex_enter(&zfsvfs->z_lock); + if (*objp == 0) { + *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, + DMU_OT_NONE, 0, tx); + VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); + } + mutex_exit(&zfsvfs->z_lock); + + if (quota == 0) { + err = zap_remove(zfsvfs->z_os, *objp, buf, tx); + if (err == ENOENT) + err = 0; + } else { + err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx); + } + ASSERT(err == 0); + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + dmu_tx_commit(tx); + return (err); +} +EXPORT_SYMBOL(zfs_set_userquota); + +boolean_t +zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) +{ + char buf[32]; + uint64_t used, quota, usedobj, quotaobj; + int err; + + usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; + quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; + + if (quotaobj == 0 || zfsvfs->z_replay) + return (B_FALSE); + + (void) sprintf(buf, "%llx", (longlong_t)fuid); + err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); + if (err != 0) + return (B_FALSE); + + err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); + if (err != 0) + return (B_FALSE); + return (used >= quota); +} +EXPORT_SYMBOL(zfs_fuid_overquota); + +boolean_t +zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup) +{ + uint64_t fuid; + uint64_t quotaobj; + + quotaobj = isgroup ? 
zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; + + fuid = isgroup ? zp->z_gid : zp->z_uid; + + if (quotaobj == 0 || zfsvfs->z_replay) + return (B_FALSE); + + return (zfs_fuid_overquota(zfsvfs, isgroup, fuid)); +} +EXPORT_SYMBOL(zfs_owner_overquota); + +int +zfsvfs_create(const char *osname, zfsvfs_t **zfvp) +{ + objset_t *os; + zfsvfs_t *zfsvfs; + uint64_t zval; + int i, error; + uint64_t sa_obj; + + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + + /* + * We claim to always be readonly so we can open snapshots; + * other ZPL code will prevent us from writing to snapshots. + */ + error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os); + if (error) { + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); + } + + /* + * Initialize the zfs-specific filesystem structure. + * Should probably make this a kmem cache, shuffle fields, + * and just bzero up to z_hold_mtx[]. + */ + zfsvfs->z_vfs = NULL; + zfsvfs->z_parent = zfsvfs; + zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; + zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; + zfsvfs->z_os = os; + + error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); + if (error) { + goto out; + } else if (zfsvfs->z_version > + zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { + (void) printk("Can't mount a version %lld file system " + "on a version %lld pool\n. Pool must be upgraded to mount " + "this file system.", (u_longlong_t)zfsvfs->z_version, + (u_longlong_t)spa_version(dmu_objset_spa(os))); + error = ENOTSUP; + goto out; + } + if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) + goto out; + zfsvfs->z_norm = (int)zval; + + if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) + goto out; + zfsvfs->z_utf8 = (zval != 0); + + if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) + goto out; + zfsvfs->z_case = (uint_t)zval; + + /* + * Fold case on file systems that are always or sometimes case + * insensitive. + */ + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + zfsvfs->z_case == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); + + if (zfsvfs->z_use_sa) { + /* should either have both of these objects or none */ + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, + &sa_obj); + if (error) + return (error); + } else { + /* + * Pre SA versions file systems should never touch + * either the attribute registration or layout objects. 
+ */ + sa_obj = 0; + } + + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + if (error) + goto out; + + if (zfsvfs->z_version >= ZPL_VERSION_SA) + sa_register_update_callback(os, zfs_sa_upgrade); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, + &zfsvfs->z_root); + if (error) + goto out; + ASSERT(zfsvfs->z_root != 0); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, + &zfsvfs->z_unlinkedobj); + if (error) + goto out; + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], + 8, 1, &zfsvfs->z_userquota_obj); + if (error && error != ENOENT) + goto out; + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], + 8, 1, &zfsvfs->z_groupquota_obj); + if (error && error != ENOENT) + goto out; + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, + &zfsvfs->z_fuid_obj); + if (error && error != ENOENT) + goto out; + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, + &zfsvfs->z_shares_dir); + if (error && error != ENOENT) + goto out; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + rrw_init(&zfsvfs->z_teardown_lock); + rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + + *zfvp = zfsvfs; + return (0); + +out: + dmu_objset_disown(os, zfsvfs); + *zfvp = NULL; + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); +} static int zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) @@ -566,9 +883,11 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) /* * Set the objset user_ptr to track its zfsvfs. */ - mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock); + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); - mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); + + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); /* * If we are not mounting (ie: online recv), then we don't @@ -588,41 +907,42 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) else zfs_unlinked_drain(zfsvfs); - zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); - if (zil_disable) { - zil_destroy(zfsvfs->z_log, 0); - zfsvfs->z_log = NULL; - } else { - /* - * Parse and replay the intent log. - * - * Because of ziltest, this must be done after - * zfs_unlinked_drain(). (Further note: ziltest - * doesn't use readonly mounts, where - * zfs_unlinked_drain() isn't called.) This is because - * ziltest causes spa_sync() to think it's committed, - * but actually it is not, so the intent log contains - * many txg's worth of changes. - * - * In particular, if object N is in the unlinked set in - * the last txg to actually sync, then it could be - * actually freed in a later txg and then reallocated - * in a yet later txg. This would write a "create - * object N" record to the intent log. Normally, this - * would be fine because the spa_sync() would have - * written out the fact that object N is free, before - * we could write the "create object N" intent log - * record. - * - * But when we are in ziltest mode, we advance the "open - * txg" without actually spa_sync()-ing the changes to - * disk. 
So we would see that object N is still - * allocated and in the unlinked set, and there is an - * intent log record saying to allocate it. - */ - zfsvfs->z_replay = B_TRUE; - zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); - zfsvfs->z_replay = B_FALSE; + /* + * Parse and replay the intent log. + * + * Because of ziltest, this must be done after + * zfs_unlinked_drain(). (Further note: ziltest + * doesn't use readonly mounts, where + * zfs_unlinked_drain() isn't called.) This is because + * ziltest causes spa_sync() to think it's committed, + * but actually it is not, so the intent log contains + * many txg's worth of changes. + * + * In particular, if object N is in the unlinked set in + * the last txg to actually sync, then it could be + * actually freed in a later txg and then reallocated + * in a yet later txg. This would write a "create + * object N" record to the intent log. Normally, this + * would be fine because the spa_sync() would have + * written out the fact that object N is free, before + * we could write the "create object N" intent log + * record. + * + * But when we are in ziltest mode, we advance the "open + * txg" without actually spa_sync()-ing the changes to + * disk. So we would see that object N is still + * allocated and in the unlinked set, and there is an + * intent log record saying to allocate it. + */ + if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { + if (zil_replay_disable) { + zil_destroy(zfsvfs->z_log, B_FALSE); + } else { + zfsvfs->z_replay = B_TRUE; + zil_replay(zfsvfs->z_os, zfsvfs, + zfs_replay_vector); + zfsvfs->z_replay = B_FALSE; + } } zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ } @@ -630,103 +950,96 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) return (0); } -static void -zfs_freezfsvfs(zfsvfs_t *zfsvfs) +void +zfsvfs_free(zfsvfs_t *zfsvfs) { + int i; + extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ + + /* + * This is a barrier to prevent the filesystem from going away in + * zfs_znode_move() until we can safely ensure that the filesystem is + * not unmounted. We consider the filesystem valid before the barrier + * and invalid after the barrier. 
+ */ + rw_enter(&zfsvfs_lock, RW_READER); + rw_exit(&zfsvfs_lock); + + zfs_fuid_destroy(zfsvfs); + mutex_destroy(&zfsvfs->z_znodes_lock); - mutex_destroy(&zfsvfs->z_online_recv_lock); + mutex_destroy(&zfsvfs->z_lock); list_destroy(&zfsvfs->z_all_znodes); rrw_destroy(&zfsvfs->z_teardown_lock); rw_destroy(&zfsvfs->z_teardown_inactive_lock); rw_destroy(&zfsvfs->z_fuid_lock); + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs->z_hold_mtx[i]); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } -static int +#ifdef HAVE_FUID_FEATURES +static void +zfs_set_fuid_feature(zfsvfs_t *zfsvfs) +{ + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + if (zfsvfs->z_use_fuids && zfsvfs->z_vfs) { + vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); +} +#endif /* HAVE_FUID_FEATURES */ + +int zfs_domount(vfs_t *vfsp, char *osname) { - dev_t mount_dev; - uint64_t recordsize, readonly; + uint64_t recordsize, fsid_guid; int error = 0; - int mode; zfsvfs_t *zfsvfs; - znode_t *zp = NULL; ASSERT(vfsp); ASSERT(osname); - /* - * Initialize the zfs-specific filesystem structure. - * Should probably make this a kmem cache, shuffle fields, - * and just bzero up to z_hold_mtx[]. - */ - zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + error = zfsvfs_create(osname, &zfsvfs); + if (error) + return (error); zfsvfs->z_vfs = vfsp; - zfsvfs->z_parent = zfsvfs; - zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; - zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; - - mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), - offsetof(znode_t, z_link_node)); - rrw_init(&zfsvfs->z_teardown_lock); - rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); - rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); /* Initialize the generic filesystem structure. */ vfsp->vfs_bcount = 0; vfsp->vfs_data = NULL; - if (zfs_create_unique_device(&mount_dev) == -1) { - error = ENODEV; - goto out; - } - ASSERT(vfs_devismounted(mount_dev) == 0); - - if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, - NULL)) + if ((error = dsl_prop_get_integer(osname, "recordsize", + &recordsize, NULL))) goto out; - vfsp->vfs_dev = mount_dev; - vfsp->vfs_fstype = zfsfstype; vfsp->vfs_bsize = recordsize; vfsp->vfs_flag |= VFS_NOTRUNC; vfsp->vfs_data = zfsvfs; - if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL)) - goto out; - - mode = DS_MODE_OWNER; - if (readonly) - mode |= DS_MODE_READONLY; - - error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); - if (error == EROFS) { - mode = DS_MODE_OWNER | DS_MODE_READONLY; - error = dmu_objset_open(osname, DMU_OST_ZFS, mode, - &zfsvfs->z_os); - } - - if (error) - goto out; - - if (error = zfs_init_fs(zfsvfs, &zp)) - goto out; - - /* The call to zfs_init_fs leaves the vnode held, release it here. */ - VN_RELE(ZTOV(zp)); + /* + * The fsid is 64 bits, composed of an 8-bit fs type, which + * separates our fsid from any other filesystem types, and a + * 56-bit objset unique ID. The objset unique ID is unique to + * all objsets open on this system, provided by unique_create(). 
+ * The 8-bit fs type must be put in the low bits of fsid[1] + * because that's where other Solaris filesystems put it. + */ + fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); + ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); + vfsp->vfs_fsid.val[0] = fsid_guid; + vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8); +#ifdef HAVE_FUID_FEATURES /* * Set features for file system. */ - zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); - if (zfsvfs->z_use_fuids) { - vfs_set_feature(vfsp, VFSFT_XVATTR); - vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS); - vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS); - vfs_set_feature(vfsp, VFSFT_ACLONCREATE); - } + zfs_set_fuid_feature(zfsvfs); if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); @@ -735,17 +1048,23 @@ zfs_domount(vfs_t *vfsp, char *osname) vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); } + vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); +#endif /* HAVE_FUID_FEATURES */ if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t pval; - ASSERT(mode & DS_MODE_READONLY); atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); - if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) + if ((error = dsl_prop_get_integer(osname,"xattr",&pval,NULL))) goto out; xattr_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; + zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; + + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); } else { error = zfsvfs_setup(zfsvfs, B_TRUE); } @@ -754,15 +1073,13 @@ zfs_domount(vfs_t *vfsp, char *osname) zfsctl_create(zfsvfs); out: if (error) { - if (zfsvfs->z_os) - dmu_objset_close(zfsvfs->z_os); - zfs_freezfsvfs(zfsvfs); - } else { - atomic_add_32(&zfs_active_fs_count, 1); + dmu_objset_disown(zfsvfs->z_os, zfsvfs); + zfsvfs_free(zfsvfs); } return (error); } +EXPORT_SYMBOL(zfs_domount); void zfs_unregister_callbacks(zfsvfs_t *zfsvfs) @@ -799,9 +1116,6 @@ zfs_unregister_callbacks(zfsvfs_t *zfsvfs) VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs) == 0); - VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, - zfsvfs) == 0); - VERIFY(dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs) == 0); @@ -809,270 +1123,39 @@ zfs_unregister_callbacks(zfsvfs_t *zfsvfs) vscan_changed_cb, zfsvfs) == 0); } } +EXPORT_SYMBOL(zfs_unregister_callbacks); +#ifdef HAVE_MLSLABEL /* - * Convert a decimal digit string to a uint64_t integer. - */ -static int -str_to_uint64(char *str, uint64_t *objnum) -{ - uint64_t num = 0; - - while (*str) { - if (*str < '0' || *str > '9') - return (EINVAL); - - num = num*10 + *str++ - '0'; - } - - *objnum = num; - return (0); -} - -/* - * The boot path passed from the boot loader is in the form of - * "rootpool-name/root-filesystem-object-number'. Convert this - * string to a dataset name: "rootpool-name/root-filesystem-name". + * zfs_check_global_label: + * Check that the hex label string is appropriate for the dataset + * being mounted into the global_zone proper. + * + * Return an error if the hex label string is not default or + * admin_low/admin_high. For admin_low labels, the corresponding + * dataset must be readonly. 
*/ -static int -zfs_parse_bootfs(char *bpath, char *outpath) +int +zfs_check_global_label(const char *dsname, const char *hexsl) { - char *slashp; - uint64_t objnum; - int error; - - if (*bpath == 0 || *bpath == '/') - return (EINVAL); - - (void) strcpy(outpath, bpath); - - slashp = strchr(bpath, '/'); - - /* if no '/', just return the pool name */ - if (slashp == NULL) { + if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) return (0); - } - - /* if not a number, just return the root dataset name */ - if (str_to_uint64(slashp+1, &objnum)) { + if (strcasecmp(hexsl, ADMIN_HIGH) == 0) return (0); + if (strcasecmp(hexsl, ADMIN_LOW) == 0) { + /* must be readonly */ + uint64_t rdonly; + + if (dsl_prop_get_integer(dsname, + zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) + return (EACCES); + return (rdonly ? 0 : EACCES); } - - *slashp = '\0'; - error = dsl_dsobj_to_dsname(bpath, objnum, outpath); - *slashp = '/'; - - return (error); + return (EACCES); } +#endif /* HAVE_MLSLABEL */ -static int -zfs_mountroot(vfs_t *vfsp, enum whymountroot why) -{ - int error = 0; - static int zfsrootdone = 0; - zfsvfs_t *zfsvfs = NULL; - znode_t *zp = NULL; - vnode_t *vp = NULL; - char *zfs_bootfs; - char *zfs_devid; - - ASSERT(vfsp); - - /* - * The filesystem that we mount as root is defined in the - * boot property "zfs-bootfs" with a format of - * "poolname/root-dataset-objnum". - */ - if (why == ROOT_INIT) { - if (zfsrootdone++) - return (EBUSY); - /* - * the process of doing a spa_load will require the - * clock to be set before we could (for example) do - * something better by looking at the timestamp on - * an uberblock, so just set it to -1. - */ - clkset(-1); - - if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) { - cmn_err(CE_NOTE, "spa_get_bootfs: can not get " - "bootfs name"); - return (EINVAL); - } - zfs_devid = spa_get_bootprop("diskdevid"); - error = spa_import_rootpool(rootfs.bo_name, zfs_devid); - if (zfs_devid) - spa_free_bootprop(zfs_devid); - if (error) { - spa_free_bootprop(zfs_bootfs); - cmn_err(CE_NOTE, "spa_import_rootpool: error %d", - error); - return (error); - } - if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { - spa_free_bootprop(zfs_bootfs); - cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d", - error); - return (error); - } - - spa_free_bootprop(zfs_bootfs); - - if (error = vfs_lock(vfsp)) - return (error); - - if (error = zfs_domount(vfsp, rootfs.bo_name)) { - cmn_err(CE_NOTE, "zfs_domount: error %d", error); - goto out; - } - - zfsvfs = (zfsvfs_t *)vfsp->vfs_data; - ASSERT(zfsvfs); - if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) { - cmn_err(CE_NOTE, "zfs_zget: error %d", error); - goto out; - } - - vp = ZTOV(zp); - mutex_enter(&vp->v_lock); - vp->v_flag |= VROOT; - mutex_exit(&vp->v_lock); - rootvp = vp; - - /* - * Leave rootvp held. The root file system is never unmounted. - */ - - vfs_add((struct vnode *)0, vfsp, - (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0); -out: - vfs_unlock(vfsp); - return (error); - } else if (why == ROOT_REMOUNT) { - readonly_changed_cb(vfsp->vfs_data, B_FALSE); - vfsp->vfs_flag |= VFS_REMOUNT; - - /* refresh mount options */ - zfs_unregister_callbacks(vfsp->vfs_data); - return (zfs_register_callbacks(vfsp)); - - } else if (why == ROOT_UNMOUNT) { - zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); - (void) zfs_sync(vfsp, 0, 0); - return (0); - } - - /* - * if "why" is equal to anything else other than ROOT_INIT, - * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. 
- */ - return (ENOTSUP); -} - -/*ARGSUSED*/ -static int -zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) -{ - char *osname; - pathname_t spn; - int error = 0; - uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ? - UIO_SYSSPACE : UIO_USERSPACE; - int canwrite; - - if (mvp->v_type != VDIR) - return (ENOTDIR); - - mutex_enter(&mvp->v_lock); - if ((uap->flags & MS_REMOUNT) == 0 && - (uap->flags & MS_OVERLAY) == 0 && - (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { - mutex_exit(&mvp->v_lock); - return (EBUSY); - } - mutex_exit(&mvp->v_lock); - - /* - * ZFS does not support passing unparsed data in via MS_DATA. - * Users should use the MS_OPTIONSTR interface; this means - * that all option parsing is already done and the options struct - * can be interrogated. - */ - if ((uap->flags & MS_DATA) && uap->datalen > 0) - return (EINVAL); - - /* - * Get the objset name (the "special" mount argument). - */ - if (error = pn_get(uap->spec, fromspace, &spn)) - return (error); - - osname = spn.pn_path; - - /* - * Check for mount privilege? - * - * If we don't have privilege then see if - * we have local permission to allow it - */ - error = secpolicy_fs_mount(cr, mvp, vfsp); - if (error) { - error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr); - if (error == 0) { - vattr_t vattr; - - /* - * Make sure user is the owner of the mount point - * or has sufficient privileges. - */ - - vattr.va_mask = AT_UID; - - if (error = VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) { - goto out; - } - - if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 && - VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) { - error = EPERM; - goto out; - } - - secpolicy_fs_mount_clearopts(cr, vfsp); - } else { - goto out; - } - } - - /* - * Refuse to mount a filesystem if we are in a local zone and the - * dataset is not visible. - */ - if (!INGLOBALZONE(curproc) && - (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { - error = EPERM; - goto out; - } - - /* - * When doing a remount, we simply refresh our temporary properties - * according to those options set in the current VFS options. - */ - if (uap->flags & MS_REMOUNT) { - /* refresh mount options */ - zfs_unregister_callbacks(vfsp->vfs_data); - error = zfs_register_callbacks(vfsp); - goto out; - } - - error = zfs_domount(vfsp, osname); - -out: - pn_free(&spn); - return (error); -} - -static int +int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; @@ -1120,7 +1203,7 @@ zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp) /* * We're a zfs filesystem. */ - (void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name); + (void) strcpy(statp->f_basetype, MNTTYPE_ZFS); statp->f_flag = vf_to_stf(vfsp->vfs_flag); @@ -1135,8 +1218,9 @@ zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp) ZFS_EXIT(zfsvfs); return (0); } +EXPORT_SYMBOL(zfs_statvfs); -static int +int zfs_root(vfs_t *vfsp, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; @@ -1152,6 +1236,7 @@ zfs_root(vfs_t *vfsp, vnode_t **vpp) ZFS_EXIT(zfsvfs); return (error); } +EXPORT_SYMBOL(zfs_root); /* * Teardown the zfsvfs::z_os. 
@@ -1208,7 +1293,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; zp = list_next(&zfsvfs->z_all_znodes, zp)) - if (zp->z_dbuf) { + if (zp->z_sa_hdl) { ASSERT(ZTOV(zp)->v_count > 0); zfs_znode_dmu_fini(zp); } @@ -1241,16 +1326,16 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) /* * Evict cached data */ - if (dmu_objset_evict_dbufs(zfsvfs->z_os)) { - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); - (void) dmu_objset_evict_dbufs(zfsvfs->z_os); - } + if (dmu_objset_is_dirty_anywhere(zfsvfs->z_os)) + if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)) + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + (void) dmu_objset_evict_dbufs(zfsvfs->z_os); return (0); } /*ARGSUSED*/ -static int +int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) { zfsvfs_t *zfsvfs = vfsp->vfs_data; @@ -1259,9 +1344,8 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) ret = secpolicy_fs_unmount(cr, vfsp); if (ret) { - ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), - ZFS_DELEG_PERM_MOUNT, cr); - if (ret) + if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), + ZFS_DELEG_PERM_MOUNT, cr)) return (ret); } @@ -1316,14 +1400,14 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) /* * Unset the objset user_ptr. */ - mutex_enter(&os->os->os_user_ptr_lock); + mutex_enter(&os->os_user_ptr_lock); dmu_objset_set_user(os, NULL); - mutex_exit(&os->os->os_user_ptr_lock); + mutex_exit(&os->os_user_ptr_lock); /* * Finally release the objset */ - dmu_objset_close(os); + dmu_objset_disown(os, zfsvfs); } /* @@ -1334,8 +1418,9 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) return (0); } +EXPORT_SYMBOL(zfs_umount); -static int +int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; @@ -1400,11 +1485,13 @@ zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp) gen_mask = -1ULL >> (64 - 8 * i); dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); - if (err = zfs_zget(zfsvfs, object, &zp)) { + if ((err = zfs_zget(zfsvfs, object, &zp))) { ZFS_EXIT(zfsvfs); return (err); } - zp_gen = zp->z_phys->zp_gen & gen_mask; + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, + sizeof (uint64_t)); + zp_gen = zp_gen & gen_mask; if (zp_gen == 0) zp_gen = 1; if (zp->z_unlinked || zp_gen != fid_gen) { @@ -1415,9 +1502,13 @@ zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp) } *vpp = ZTOV(zp); + if (*vpp) + zfs_inode_update(VTOZ(*vpp)); + ZFS_EXIT(zfsvfs); return (0); } +EXPORT_SYMBOL(zfs_vget); /* * Block out VOPs and close zfsvfs_t::z_os @@ -1426,36 +1517,47 @@ zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp) * 'z_teardown_inactive_lock' write held. */ int -zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode) +zfs_suspend_fs(zfsvfs_t *zfsvfs) { int error; if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) return (error); - - *mode = zfsvfs->z_os->os_mode; - dmu_objset_name(zfsvfs->z_os, name); - dmu_objset_close(zfsvfs->z_os); + dmu_objset_disown(zfsvfs->z_os, zfsvfs); return (0); } +EXPORT_SYMBOL(zfs_suspend_fs); /* * Reopen zfsvfs_t::z_os and release VOPs. 
*/ int -zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode) +zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname) { - int err; + int err, err2; ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock)); ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); - err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); + err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs, + &zfsvfs->z_os); if (err) { zfsvfs->z_os = NULL; } else { znode_t *zp; + uint64_t sa_obj = 0; + + err2 = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_SA_ATTRS, 8, 1, &sa_obj); + + if ((err || err2) && zfsvfs->z_version >= ZPL_VERSION_SA) + goto bail; + + + if ((err = sa_setup(zfsvfs->z_os, sa_obj, + zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table)) != 0) + goto bail; VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); @@ -1474,6 +1576,7 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode) } +bail: /* release the VOPs */ rw_exit(&zfsvfs->z_teardown_inactive_lock); rrw_exit(&zfsvfs->z_teardown_lock, FTAG); @@ -1488,76 +1591,24 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode) } return (err); } +EXPORT_SYMBOL(zfs_resume_fs); static void zfs_freevfs(vfs_t *vfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; - int i; - - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_destroy(&zfsvfs->z_hold_mtx[i]); - - zfs_fuid_destroy(zfsvfs); - zfs_freezfsvfs(zfsvfs); - - atomic_add_32(&zfs_active_fs_count, -1); -} -/* - * VFS_INIT() initialization. Note that there is no VFS_FINI(), - * so we can't safely do any non-idempotent initialization here. - * Leave that to zfs_init() and zfs_fini(), which are called - * from the module's _init() and _fini() entry points. - */ -/*ARGSUSED*/ -static int -zfs_vfsinit(int fstype, char *name) -{ - int error; - - zfsfstype = fstype; - - /* - * Setup vfsops and vnodeops tables. - */ - error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops); - if (error != 0) { - cmn_err(CE_WARN, "zfs: bad vfs ops template"); - } - - error = zfs_create_op_tables(); - if (error) { - zfs_remove_op_tables(); - cmn_err(CE_WARN, "zfs: bad vnode ops template"); - (void) vfs_freevfsops_by_type(zfsfstype); - return (error); - } - - mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL); - - /* - * Unique major number for all zfs mounts. - * If we run out of 32-bit minors, we'll getudev() another major. - */ - zfs_major = ddi_name_to_major(ZFS_DRIVER); - zfs_minor = ZFS_MIN_MINOR; - - return (0); + zfsvfs_free(zfsvfs); } +#endif /* HAVE_ZPL */ void zfs_init(void) { - /* - * Initialize .zfs directory structures - */ zfsctl_init(); - - /* - * Initialize znode cache, vnode ops, etc... - */ zfs_znode_init(); + + dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); } void @@ -1567,62 +1618,78 @@ zfs_fini(void) zfs_znode_fini(); } +#ifdef HAVE_ZPL int -zfs_busy(void) -{ - return (zfs_active_fs_count != 0); -} - -int -zfs_set_version(const char *name, uint64_t newvers) +zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) { int error; - objset_t *os; + objset_t *os = zfsvfs->z_os; dmu_tx_t *tx; - uint64_t curvers; - - /* - * XXX for now, require that the filesystem be unmounted. Would - * be nice to find the zfsvfs_t and just update that if - * possible. 
- */ if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) return (EINVAL); - error = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_OWNER, &os); - if (error) - return (error); + if (newvers < zfsvfs->z_version) + return (EINVAL); - error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, - 8, 1, &curvers); - if (error) - goto out; - if (newvers < curvers) { - error = EINVAL; - goto out; - } + if (zfs_spa_version_map(newvers) > + spa_version(dmu_objset_spa(zfsvfs->z_os))) + return (ENOTSUP); tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 0, ZPL_VERSION_STR); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + ZFS_SA_ATTRS); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); - goto out; + return (error); } - error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, - &newvers, tx); - spa_history_internal_log(LOG_DS_UPGRADE, - dmu_objset_spa(os), tx, CRED(), - "oldver=%llu newver=%llu dataset = %llu", curvers, newvers, - dmu_objset_id(os)); + error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &newvers, tx); + + if (error) { + dmu_tx_commit(tx); + return (error); + } + + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + uint64_t sa_obj; + + ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, + SPA_VERSION_SA); + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + + error = zap_add(os, MASTER_NODE_OBJ, + ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT3U(error, ==, 0); + + VERIFY(0 == sa_set_sa_object(os, sa_obj)); + sa_register_update_callback(os, zfs_sa_upgrade); + } + + spa_history_log_internal(LOG_DS_UPGRADE, + dmu_objset_spa(os), tx, "oldver=%llu newver=%llu dataset = %llu", + zfsvfs->z_version, newvers, dmu_objset_id(os)); + dmu_tx_commit(tx); -out: - dmu_objset_close(os); - return (error); + zfsvfs->z_version = newvers; + +#ifdef HAVE_FUID_FEATURES + if (zfsvfs->z_version >= ZPL_VERSION_FUID) + zfs_set_fuid_feature(zfsvfs); +#endif /* HAVE_FUID_FEATURES */ + + return (0); } +EXPORT_SYMBOL(zfs_set_version); +#endif /* HAVE_ZPL */ /* * Read a property stored within the master node. @@ -1665,16 +1732,3 @@ zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) } return (error); } - -static vfsdef_t vfw = { - VFSDEF_VERSION, - MNTTYPE_ZFS, - zfs_vfsinit, - VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS| - VSW_XID, - &zfs_mntopts -}; - -struct modlfs zfs_modlfs = { - &mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw -};
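The vfsdef_t/modlfs tables deleted at the end of this patch are the Solaris mechanism for registering a filesystem with the kernel's VFS switch; on Linux the analogous hook is a struct file_system_type handed to register_filesystem(). The sketch below is illustrative only and is not part of this patch -- in the ZFS on Linux tree the registration lives in the separate zpl_* layer rather than in zfs_vfsops.c -- and every *_example identifier is a placeholder, as is the assumption of a kernel recent enough to provide the .mount/mount_nodev interface.

#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/module.h>

/* Placeholder fill_super: the real code would call zfs_domount() here. */
static int
zfs_fill_super_example(struct super_block *sb, void *data, int silent)
{
	return (-ENOSYS);
}

/* .mount callback: "dev_name" carries the dataset name, "data" the options. */
static struct dentry *
zfs_mount_example(struct file_system_type *fs_type, int flags,
    const char *dev_name, void *data)
{
	return (mount_nodev(fs_type, flags, data, zfs_fill_super_example));
}

static struct file_system_type zfs_fs_type_example = {
	.owner		= THIS_MODULE,
	.name		= "zfs",
	.mount		= zfs_mount_example,
	.kill_sb	= kill_anon_super,
};

/* Registered/unregistered from the module's _init()/_fini() entry points. */
static int __init
zfs_fs_type_register_example(void)
{
	return (register_filesystem(&zfs_fs_type_example));
}

static void __exit
zfs_fs_type_unregister_example(void)
{
	unregister_filesystem(&zfs_fs_type_example);
}

This replaces the Solaris pattern of vfs_setfsops()/vfs_freevfsops_by_type() plus a mntopts_t table; mount options on Linux are instead parsed from the option string passed through the .mount callback's data argument.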