Update core ZFS code from build 121 to build 141.
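
The bulk of this change moves zfs_vfsops.c from the anonymous dmu_objset_open()/dmu_objset_close() interface to the tagged dmu_objset_own()/dmu_objset_disown() interface and drops the DS_MODE_* plumbing from zfsvfs_create(), zfs_suspend_fs(), and zfs_resume_fs(). It also retires the aclmode callback, turns zfs_space_delta_cb() into a simple uid/gid reporting hook (the per-id ZAP bookkeeping now lives in the DMU), wires up system-attribute (SA) setup, and adds Trusted Extensions mount label checks. The sketch below illustrates only the ownership pattern; my_hold_fs() and my_rele_fs() are hypothetical helpers for illustration, not code from this patch.

    #include <sys/types.h>
    #include <sys/dmu.h>

    /*
     * Hypothetical wrappers, for illustration only (not part of this patch).
     * Build 121 used dmu_objset_open(osname, DMU_OST_ZFS, mode, &os) and
     * dmu_objset_close(os); build 141 takes a tagged, long-lived reference.
     */
    static int
    my_hold_fs(const char *osname, void *tag, objset_t **osp)
    {
            /*
             * Always claim readonly, as zfsvfs_create() now does, so that
             * snapshots can be opened; the ZPL blocks snapshot writes
             * elsewhere.
             */
            return (dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, tag, osp));
    }

    static void
    my_rele_fs(objset_t *os, void *tag)
    {
            /* Release with the same tag that was passed to dmu_objset_own(). */
            dmu_objset_disown(os, tag);
    }
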
diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c
index d03f92b..f68dde8 100644
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
+/* Portions Copyright 2010 Robert Milkowski */
+
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -46,6 +47,7 @@
 #include <sys/dsl_deleg.h>
 #include <sys/spa.h>
 #include <sys/zap.h>
+#include <sys/sa.h>
 #include <sys/varargs.h>
 #include <sys/policy.h>
 #include <sys/atomic.h>
@@ -60,6 +62,8 @@
 #include <sys/dnlc.h>
 #include <sys/dmu_objset.h>
 #include <sys/spa_boot.h>
+#include <sys/sa.h>
+#include "zfs_comutil.h"
 
 int zfsfstype;
 vfsops_t *zfs_vfsops = NULL;
@@ -163,8 +167,7 @@ zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
 
                if (zfsvfs->z_log != NULL)
                        zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
-               else
-                       txg_wait_synced(dp, 0);
+
                ZFS_EXIT(zfsvfs);
        } else {
                /*
@@ -381,14 +384,6 @@ vscan_changed_cb(void *arg, uint64_t newval)
 }
 
 static void
-acl_mode_changed_cb(void *arg, uint64_t newval)
-{
-       zfsvfs_t *zfsvfs = arg;
-
-       zfsvfs->z_acl_mode = newval;
-}
-
-static void
 acl_inherit_changed_cb(void *arg, uint64_t newval)
 {
        zfsvfs_t *zfsvfs = arg;
@@ -518,8 +513,6 @@ zfs_register_callbacks(vfs_t *vfsp)
        error = error ? error : dsl_prop_register(ds,
            "snapdir", snapdir_changed_cb, zfsvfs);
        error = error ? error : dsl_prop_register(ds,
-           "aclmode", acl_mode_changed_cb, zfsvfs);
-       error = error ? error : dsl_prop_register(ds,
            "aclinherit", acl_inherit_changed_cb, zfsvfs);
        error = error ? error : dsl_prop_register(ds,
            "vscan", vscan_changed_cb, zfsvfs);
@@ -560,7 +553,6 @@ unregister:
        (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
        (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
        (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
-       (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
        (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
            zfsvfs);
        (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
@@ -568,69 +560,59 @@ unregister:
 
 }
 
-static void
-uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid,
-    int64_t delta, dmu_tx_t *tx)
-{
-       uint64_t used = 0;
-       char buf[32];
-       int err;
-       uint64_t obj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
-
-       if (delta == 0)
-               return;
-
-       (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid);
-       err = zap_lookup(os, obj, buf, 8, 1, &used);
-       ASSERT(err == 0 || err == ENOENT);
-       /* no underflow/overflow */
-       ASSERT(delta > 0 || used >= -delta);
-       ASSERT(delta < 0 || used + delta > used);
-       used += delta;
-       if (used == 0)
-               err = zap_remove(os, obj, buf, tx);
-       else
-               err = zap_update(os, obj, buf, 8, 1, &used, tx);
-       ASSERT(err == 0);
-}
-
-static void
-zfs_space_delta_cb(objset_t *os, dmu_object_type_t bonustype,
-    void *oldbonus, void *newbonus,
-    uint64_t oldused, uint64_t newused, dmu_tx_t *tx)
+static int
+zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
+    uint64_t *userp, uint64_t *groupp)
 {
-       znode_phys_t *oldznp = oldbonus;
-       znode_phys_t *newznp = newbonus;
+       znode_phys_t *znp = data;
+       int error = 0;
 
-       if (bonustype != DMU_OT_ZNODE)
-               return;
+       /*
+        * Is it a valid type of object to track?
+        */
+       if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
+               return (ENOENT);
 
-       /* We charge 512 for the dnode (if it's allocated). */
-       if (oldznp->zp_gen != 0)
-               oldused += DNODE_SIZE;
-       if (newznp->zp_gen != 0)
-               newused += DNODE_SIZE;
+       /*
+        * If we have a NULL data pointer
+        * then assume the ids aren't changing and
+        * return EEXIST to the dmu to let it know to
+        * use the same ids.
+        */
+       if (data == NULL)
+               return (EEXIST);
 
-       if (oldznp->zp_uid == newznp->zp_uid) {
-               uidacct(os, B_FALSE, oldznp->zp_uid, newused-oldused, tx);
+       if (bonustype == DMU_OT_ZNODE) {
+               *userp = znp->zp_uid;
+               *groupp = znp->zp_gid;
        } else {
-               uidacct(os, B_FALSE, oldznp->zp_uid, -oldused, tx);
-               uidacct(os, B_FALSE, newznp->zp_uid, newused, tx);
-       }
+               int hdrsize;
 
-       if (oldznp->zp_gid == newznp->zp_gid) {
-               uidacct(os, B_TRUE, oldznp->zp_gid, newused-oldused, tx);
-       } else {
-               uidacct(os, B_TRUE, oldznp->zp_gid, -oldused, tx);
-               uidacct(os, B_TRUE, newznp->zp_gid, newused, tx);
+               ASSERT(bonustype == DMU_OT_SA);
+               hdrsize = sa_hdrsize(data);
+
+               if (hdrsize != 0) {
+                       *userp = *((uint64_t *)((uintptr_t)data + hdrsize +
+                           SA_UID_OFFSET));
+                       *groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
+                           SA_GID_OFFSET));
+               } else {
+                       /*
+                        * This should only happen for newly created
+                        * files that haven't had the znode data filled
+                        * in yet.
+                        */
+                       *userp = 0;
+                       *groupp = 0;
+               }
        }
+       return (error);
 }
 
 static void
 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
     char *domainbuf, int buflen, uid_t *ridp)
 {
-       extern uint64_t strtonum(const char *str, char **nptr);
        uint64_t fuid;
        const char *domain;
 
@@ -811,7 +793,7 @@ zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
 }
 
 boolean_t
-zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
+zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
 {
        char buf[32];
        uint64_t used, quota, usedobj, quotaobj;
@@ -834,33 +816,57 @@ zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
        return (used >= quota);
 }
 
+boolean_t
+zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
+{
+       uint64_t fuid;
+       uint64_t quotaobj;
+       uid_t id;
+
+       quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
+
+       id = isgroup ? zp->z_gid : zp->z_uid;
+
+       if (quotaobj == 0 || zfsvfs->z_replay)
+               return (B_FALSE);
+
+       if (IS_EPHEMERAL(id)) {
+               VERIFY(0 == sa_lookup(zp->z_sa_hdl,
+                   isgroup ? SA_ZPL_GID(zfsvfs) : SA_ZPL_UID(zfsvfs),
+                   &fuid, sizeof (fuid)));
+       } else {
+               fuid = (uint64_t)id;
+       }
+
+       return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
+}
+
 int
-zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp)
+zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
 {
        objset_t *os;
        zfsvfs_t *zfsvfs;
        uint64_t zval;
        int i, error;
+       uint64_t sa_obj;
 
-       if (error = dsl_prop_get_integer(osname, "readonly", &zval, NULL))
-               return (error);
-       if (zval)
-               mode |= DS_MODE_READONLY;
+       zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 
-       error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os);
-       if (error == EROFS) {
-               mode |= DS_MODE_READONLY;
-               error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os);
-       }
-       if (error)
+       /*
+        * We claim to always be readonly so we can open snapshots;
+        * other ZPL code will prevent us from writing to snapshots.
+        */
+       error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
+       if (error) {
+               kmem_free(zfsvfs, sizeof (zfsvfs_t));
                return (error);
+       }
 
        /*
         * Initialize the zfs-specific filesystem structure.
         * Should probably make this a kmem cache, shuffle fields,
         * and just bzero up to z_hold_mtx[].
         */
-       zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
        zfsvfs->z_vfs = NULL;
        zfsvfs->z_parent = zfsvfs;
        zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
@@ -870,15 +876,15 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp)
        error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
        if (error) {
                goto out;
-       } else if (zfsvfs->z_version > ZPL_VERSION) {
-               (void) printf("Mismatched versions:  File system "
-                   "is version %llu on-disk format, which is "
-                   "incompatible with this software version %lld!",
-                   (u_longlong_t)zfsvfs->z_version, ZPL_VERSION);
+       } else if (zfsvfs->z_version >
+           zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
+               (void) printf("Can't mount a version %lld file system "
+                   "on a version %lld pool. Pool must be upgraded to mount "
+                   "this file system.\n", (u_longlong_t)zfsvfs->z_version,
+                   (u_longlong_t)spa_version(dmu_objset_spa(os)));
                error = ENOTSUP;
                goto out;
        }
-
        if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
                goto out;
        zfsvfs->z_norm = (int)zval;
@@ -900,6 +906,26 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp)
                zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
 
        zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+       zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+
+       if (zfsvfs->z_use_sa) {
+               /* should either have both of these objects or none */
+               error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
+                   &sa_obj);
+               if (error)
+                       goto out;
+       } else {
+               /*
+                * Pre-SA version file systems should never touch
+                * either the attribute registration or layout objects.
+                */
+               sa_obj = 0;
+       }
+
+       zfsvfs->z_attr_table = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END);
+
+       if (zfsvfs->z_version >= ZPL_VERSION_SA)
+               sa_register_update_callback(os, zfs_sa_upgrade);
 
        error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
            &zfsvfs->z_root);
@@ -944,12 +970,12 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp)
        for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
                mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 
-       *zvp = zfsvfs;
+       *zfvp = zfsvfs;
        return (0);
 
 out:
-       dmu_objset_close(os);
-       *zvp = NULL;
+       dmu_objset_disown(os, zfsvfs);
+       *zfvp = NULL;
        kmem_free(zfsvfs, sizeof (zfsvfs_t));
        return (error);
 }
@@ -966,15 +992,11 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
        /*
         * Set the objset user_ptr to track its zfsvfs.
         */
-       mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
+       mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
        dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
-       mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
+       mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
 
        zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
-       if (zil_disable) {
-               zil_destroy(zfsvfs->z_log, 0);
-               zfsvfs->z_log = NULL;
-       }
 
        /*
         * If we are not mounting (ie: online recv), then we don't
@@ -994,34 +1016,36 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
                else
                        zfs_unlinked_drain(zfsvfs);
 
-               if (zfsvfs->z_log) {
-                       /*
-                        * Parse and replay the intent log.
-                        *
-                        * Because of ziltest, this must be done after
-                        * zfs_unlinked_drain().  (Further note: ziltest
-                        * doesn't use readonly mounts, where
-                        * zfs_unlinked_drain() isn't called.)  This is because
-                        * ziltest causes spa_sync() to think it's committed,
-                        * but actually it is not, so the intent log contains
-                        * many txg's worth of changes.
-                        *
-                        * In particular, if object N is in the unlinked set in
-                        * the last txg to actually sync, then it could be
-                        * actually freed in a later txg and then reallocated
-                        * in a yet later txg.  This would write a "create
-                        * object N" record to the intent log.  Normally, this
-                        * would be fine because the spa_sync() would have
-                        * written out the fact that object N is free, before
-                        * we could write the "create object N" intent log
-                        * record.
-                        *
-                        * But when we are in ziltest mode, we advance the "open
-                        * txg" without actually spa_sync()-ing the changes to
-                        * disk.  So we would see that object N is still
-                        * allocated and in the unlinked set, and there is an
-                        * intent log record saying to allocate it.
-                        */
+               /*
+                * Parse and replay the intent log.
+                *
+                * Because of ziltest, this must be done after
+                * zfs_unlinked_drain().  (Further note: ziltest
+                * doesn't use readonly mounts, where
+                * zfs_unlinked_drain() isn't called.)  This is because
+                * ziltest causes spa_sync() to think it's committed,
+                * but actually it is not, so the intent log contains
+                * many txg's worth of changes.
+                *
+                * In particular, if object N is in the unlinked set in
+                * the last txg to actually sync, then it could be
+                * actually freed in a later txg and then reallocated
+                * in a yet later txg.  This would write a "create
+                * object N" record to the intent log.  Normally, this
+                * would be fine because the spa_sync() would have
+                * written out the fact that object N is free, before
+                * we could write the "create object N" intent log
+                * record.
+                *
+                * But when we are in ziltest mode, we advance the "open
+                * txg" without actually spa_sync()-ing the changes to
+                * disk.  So we would see that object N is still
+                * allocated and in the unlinked set, and there is an
+                * intent log record saying to allocate it.
+                */
+               if (zil_replay_disable) {
+                       zil_destroy(zfsvfs->z_log, B_FALSE);
+               } else {
                        zfsvfs->z_replay = B_TRUE;
                        zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector);
                        zfsvfs->z_replay = B_FALSE;
@@ -1070,7 +1094,9 @@ zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
                vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
                vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
                vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
+               vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
        }
+       zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 }
 
 static int
@@ -1084,7 +1110,7 @@ zfs_domount(vfs_t *vfsp, char *osname)
        ASSERT(vfsp);
        ASSERT(osname);
 
-       error = zfsvfs_create(osname, DS_MODE_OWNER, &zfsvfs);
+       error = zfsvfs_create(osname, &zfsvfs);
        if (error)
                return (error);
        zfsvfs->z_vfs = vfsp;
@@ -1135,6 +1161,7 @@ zfs_domount(vfs_t *vfsp, char *osname)
                vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
                vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
        }
+       vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
 
        if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
                uint64_t pval;
@@ -1146,9 +1173,9 @@ zfs_domount(vfs_t *vfsp, char *osname)
                xattr_changed_cb(zfsvfs, pval);
                zfsvfs->z_issnap = B_TRUE;
 
-               mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
+               mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
                dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
-               mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
+               mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
        } else {
                error = zfsvfs_setup(zfsvfs, B_TRUE);
        }
@@ -1157,7 +1184,7 @@ zfs_domount(vfs_t *vfsp, char *osname)
                zfsctl_create(zfsvfs);
 out:
        if (error) {
-               dmu_objset_close(zfsvfs->z_os);
+               dmu_objset_disown(zfsvfs->z_os, zfsvfs);
                zfsvfs_free(zfsvfs);
        } else {
                atomic_add_32(&zfs_active_fs_count, 1);
@@ -1201,9 +1228,6 @@ zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
                VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
                    zfsvfs) == 0);
 
-               VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
-                   zfsvfs) == 0);
-
                VERIFY(dsl_prop_unregister(ds, "aclinherit",
                    acl_inherit_changed_cb, zfsvfs) == 0);
 
@@ -1267,6 +1291,139 @@ zfs_parse_bootfs(char *bpath, char *outpath)
        return (error);
 }
 
+/*
+ * zfs_check_global_label:
+ *     Check that the hex label string is appropriate for the dataset
+ *     being mounted into the global_zone proper.
+ *
+ *     Return an error if the hex label string is not default or
+ *     admin_low/admin_high.  For admin_low labels, the corresponding
+ *     dataset must be readonly.
+ */
+int
+zfs_check_global_label(const char *dsname, const char *hexsl)
+{
+       if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
+               return (0);
+       if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
+               return (0);
+       if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
+               /* must be readonly */
+               uint64_t rdonly;
+
+               if (dsl_prop_get_integer(dsname,
+                   zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
+                       return (EACCES);
+               return (rdonly ? 0 : EACCES);
+       }
+       return (EACCES);
+}
+
+/*
+ * zfs_mount_label_policy:
+ *     Determine whether the mount is allowed according to MAC check,
+ *     by comparing (where appropriate) the label of the dataset against
+ *     the label of the zone being mounted into.  If the dataset has
+ *     no label, create one.
+ *
+ *     Returns:
+ *              0 :    access allowed
+ *             >0 :    error code, such as EACCES
+ */
+static int
+zfs_mount_label_policy(vfs_t *vfsp, char *osname)
+{
+       int             error, retv;
+       zone_t          *mntzone = NULL;
+       ts_label_t      *mnt_tsl;
+       bslabel_t       *mnt_sl;
+       bslabel_t       ds_sl;
+       char            ds_hexsl[MAXNAMELEN];
+
+       retv = EACCES;                          /* assume the worst */
+
+       /*
+        * Start by getting the dataset label if it exists.
+        */
+       error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
+           1, sizeof (ds_hexsl), &ds_hexsl, NULL);
+       if (error)
+               return (EACCES);
+
+       /*
+        * If labeling is NOT enabled, then disallow the mount of datasets
+        * which have a non-default label already.  No other label checks
+        * are needed.
+        */
+       if (!is_system_labeled()) {
+               if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
+                       return (0);
+               return (EACCES);
+       }
+
+       /*
+        * Get the label of the mountpoint.  If mounting into the global
+        * zone (i.e. mountpoint is not within an active zone and the
+        * zoned property is off), the label must be default or
+        * admin_low/admin_high only; no other checks are needed.
+        */
+       mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
+       if (mntzone->zone_id == GLOBAL_ZONEID) {
+               uint64_t zoned;
+
+               zone_rele(mntzone);
+
+               if (dsl_prop_get_integer(osname,
+                   zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
+                       return (EACCES);
+               if (!zoned)
+                       return (zfs_check_global_label(osname, ds_hexsl));
+               else
+                       /*
+                        * This is the case of a zone dataset being mounted
+                        * initially, before the zone has been fully created;
+                        * allow this mount into global zone.
+                        */
+                       return (0);
+       }
+
+       mnt_tsl = mntzone->zone_slabel;
+       ASSERT(mnt_tsl != NULL);
+       label_hold(mnt_tsl);
+       mnt_sl = label2bslabel(mnt_tsl);
+
+       if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
+               /*
+                * The dataset doesn't have a real label, so fabricate one.
+                */
+               char *str = NULL;
+
+               if (l_to_str_internal(mnt_sl, &str) == 0 &&
+                   dsl_prop_set(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
+                   ZPROP_SRC_LOCAL, 1, strlen(str) + 1, str) == 0)
+                       retv = 0;
+               if (str != NULL)
+                       kmem_free(str, strlen(str) + 1);
+       } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
+               /*
+                * Now compare labels to complete the MAC check.  If the
+                * labels are equal then allow access.  If the mountpoint
+                * label dominates the dataset label, allow readonly access.
+                * Otherwise, access is denied.
+                */
+               if (blequal(mnt_sl, &ds_sl))
+                       retv = 0;
+               else if (bldominates(mnt_sl, &ds_sl)) {
+                       vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
+                       retv = 0;
+               }
+       }
+
+       label_rele(mnt_tsl);
+       zone_rele(mntzone);
+       return (retv);
+}
+
 static int
 zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
 {
@@ -1419,8 +1576,7 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
         */
        error = secpolicy_fs_mount(cr, mvp, vfsp);
        if (error) {
-               error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr);
-               if (error == 0) {
+               if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) == 0) {
                        vattr_t         vattr;
 
                        /*
@@ -1430,16 +1586,14 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 
                        vattr.va_mask = AT_UID;
 
-                       if (error = VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
+                       if (VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
                                goto out;
                        }
 
                        if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 &&
                            VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) {
-                               error = EPERM;
                                goto out;
                        }
-
                        secpolicy_fs_mount_clearopts(cr, vfsp);
                } else {
                        goto out;
@@ -1456,6 +1610,10 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
                goto out;
        }
 
+       error = zfs_mount_label_policy(vfsp, osname);
+       if (error)
+               goto out;
+
        /*
         * When doing a remount, we simply refresh our temporary properties
         * according to those options set in the current VFS options.
@@ -1617,7 +1775,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
        mutex_enter(&zfsvfs->z_znodes_lock);
        for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
            zp = list_next(&zfsvfs->z_all_znodes, zp))
-               if (zp->z_dbuf) {
+               if (zp->z_sa_hdl) {
                        ASSERT(ZTOV(zp)->v_count > 0);
                        zfs_znode_dmu_fini(zp);
                }
@@ -1668,9 +1826,8 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
 
        ret = secpolicy_fs_unmount(cr, vfsp);
        if (ret) {
-               ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
-                   ZFS_DELEG_PERM_MOUNT, cr);
-               if (ret)
+               if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
+                   ZFS_DELEG_PERM_MOUNT, cr))
                        return (ret);
        }
 
@@ -1725,14 +1882,14 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
                /*
                 * Unset the objset user_ptr.
                 */
-               mutex_enter(&os->os->os_user_ptr_lock);
+               mutex_enter(&os->os_user_ptr_lock);
                dmu_objset_set_user(os, NULL);
-               mutex_exit(&os->os->os_user_ptr_lock);
+               mutex_exit(&os->os_user_ptr_lock);
 
                /*
                 * Finally release the objset
                 */
-               dmu_objset_close(os);
+               dmu_objset_disown(os, zfsvfs);
        }
 
        /*
@@ -1813,7 +1970,9 @@ zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
                ZFS_EXIT(zfsvfs);
                return (err);
        }
-       zp_gen = zp->z_phys->zp_gen & gen_mask;
+       (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
+           sizeof (uint64_t));
+       zp_gen = zp_gen & gen_mask;
        if (zp_gen == 0)
                zp_gen = 1;
        if (zp->z_unlinked || zp_gen != fid_gen) {
@@ -1835,17 +1994,13 @@ zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
  * 'z_teardown_inactive_lock' write held.
  */
 int
-zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *modep)
+zfs_suspend_fs(zfsvfs_t *zfsvfs)
 {
        int error;
 
        if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
                return (error);
-
-       *modep = zfsvfs->z_os->os_mode;
-       if (name)
-               dmu_objset_name(zfsvfs->z_os, name);
-       dmu_objset_close(zfsvfs->z_os);
+       dmu_objset_disown(zfsvfs->z_os, zfsvfs);
 
        return (0);
 }
@@ -1854,18 +2009,30 @@ zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *modep)
  * Reopen zfsvfs_t::z_os and release VOPs.
  */
 int
-zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode)
+zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname)
 {
-       int err;
+       int err, err2;
 
        ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
        ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
 
-       err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
+       err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs,
+           &zfsvfs->z_os);
        if (err) {
                zfsvfs->z_os = NULL;
        } else {
                znode_t *zp;
+               uint64_t sa_obj = 0;
+
+               err2 = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
+                   ZFS_SA_ATTRS, 8, 1, &sa_obj);
+
+               if ((err || err2) && zfsvfs->z_version >= ZPL_VERSION_SA)
+                       goto bail;
+
+
+               zfsvfs->z_attr_table = sa_setup(zfsvfs->z_os, sa_obj,
+                   zfs_attr_table,  ZPL_END);
 
                VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
 
@@ -1884,6 +2051,7 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode)
 
        }
 
+bail:
        /* release the VOPs */
        rw_exit(&zfsvfs->z_teardown_inactive_lock);
        rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
@@ -1906,9 +2074,11 @@ zfs_freevfs(vfs_t *vfsp)
 
        /*
         * If this is a snapshot, we have an extra VFS_HOLD on our parent
-        * from zfs_mount().  Release it here.
+        * from zfs_mount().  Release it here.  If we came through
+        * zfs_mountroot() instead, we didn't grab an extra hold, so
+        * skip the VFS_RELE for rootvfs.
         */
-       if (zfsvfs->z_issnap)
+       if (zfsvfs->z_issnap && (vfsp != rootvfs))
                VFS_RELE(zfsvfs->z_parent->z_vfs);
 
        zfsvfs_free(zfsvfs);
@@ -2000,13 +2170,23 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
        if (newvers < zfsvfs->z_version)
                return (EINVAL);
 
+       if (zfs_spa_version_map(newvers) >
+           spa_version(dmu_objset_spa(zfsvfs->z_os)))
+               return (ENOTSUP);
+
        tx = dmu_tx_create(os);
        dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
+       if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+               dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
+                   ZFS_SA_ATTRS);
+               dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+       }
        error = dmu_tx_assign(tx, TXG_WAIT);
        if (error) {
                dmu_tx_abort(tx);
                return (error);
        }
+
        error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
            8, 1, &newvers, tx);
 
@@ -2015,9 +2195,24 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
                return (error);
        }
 
-       spa_history_internal_log(LOG_DS_UPGRADE,
-           dmu_objset_spa(os), tx, CRED(),
-           "oldver=%llu newver=%llu dataset = %llu",
+       if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+               uint64_t sa_obj;
+
+               ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
+                   SPA_VERSION_SA);
+               sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+                   DMU_OT_NONE, 0, tx);
+
+               error = zap_add(os, MASTER_NODE_OBJ,
+                   ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+               ASSERT3U(error, ==, 0);
+
+               VERIFY(0 == sa_set_sa_object(os, sa_obj));
+               sa_register_update_callback(os, zfs_sa_upgrade);
+       }
+
+       spa_history_log_internal(LOG_DS_UPGRADE,
+           dmu_objset_spa(os), tx, "oldver=%llu newver=%llu dataset = %llu",
            zfsvfs->z_version, newvers, dmu_objset_id(os));
 
        dmu_tx_commit(tx);
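
For reference, a hedged sketch of a caller adapting to the new zfs_suspend_fs()/zfs_resume_fs() signatures above: the mode and name out-parameters are gone because suspend now disowns zfsvfs->z_os and resume re-owns it by name and rebuilds the SA attribute table. my_remount_fs() is hypothetical, not code from this patch.

    #include <sys/zfs_vfsops.h>

    /* Hypothetical caller, for illustration only (not part of this patch). */
    static int
    my_remount_fs(zfsvfs_t *zfsvfs, const char *osname)
    {
            int error;

            /* Tear down the ZPL and disown zfsvfs->z_os. */
            if ((error = zfs_suspend_fs(zfsvfs)) != 0)
                    return (error);

            /* ... operate on the dataset while the ZPL is quiesced ... */

            /* Re-own the objset by name and replay/reload ZPL state. */
            return (zfs_resume_fs(zfsvfs, osname));
    }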