X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fzfs_ctldir.c;h=b35f27d194db5bb24794dc1114b0075077da2ee0;hb=refs%2Fheads%2Frertzinger%2Ffeature-zpool-get--p;hp=208fc36295d077f33619d2f9fe2c24eb9059a940;hpb=172bb4bd5e4afef721dd4d2972d8680d983f144b;p=zfs.git

diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c
index 208fc36..b35f27d 100644
--- a/module/zfs/zfs_ctldir.c
+++ b/module/zfs/zfs_ctldir.c
@@ -19,25 +19,29 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ *
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * LLNL-CODE-403049.
+ * Rewritten for Linux by:
+ *   Rohan Puri <rohan.puri15@gmail.com>
+ *   Brian Behlendorf <behlendorf1@llnl.gov>
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * ZFS control directory (a.k.a. ".zfs")
  *
  * This directory provides a common location for all ZFS meta-objects.
- * Currently, this is only the 'snapshot' directory, but this may expand in the
- * future.  The elements are built using the GFS primitives, as the hierarchy
+ * Currently, this is only the 'snapshot' and 'shares' directories, but this may
+ * expand in the future.  The elements are built dynamically, as the hierarchy
  * does not actually exist on disk.
  *
  * For 'snapshot', we don't want to have all snapshots always mounted, because
  * this would take up a huge amount of space in /etc/mnttab.  We have three
  * types of objects:
  *
- * 	ctldir ------> snapshotdir -------> snapshot
+ *	ctldir ------> snapshotdir -------> snapshot
  *                                             |
  *                                             |
  *                                             V
@@ -46,59 +50,85 @@
  * The 'snapshot' node contains just enough information to lookup '..' and act
  * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
  * perform an automount of the underlying filesystem and return the
- * corresponding vnode.
+ * corresponding inode.
  *
- * All mounts are handled automatically by the kernel, but unmounts are
- * (currently) handled from user land.  The main reason is that there is no
- * reliable way to auto-unmount the filesystem when it's "no longer in use".
- * When the user unmounts a filesystem, we call zfsctl_unmount(), which
- * unmounts any snapshots within the snapshot directory.
+ * All mounts are handled automatically by a user mode helper which invokes
+ * the mount procedure.  Unmounts are handled by allowing the mount
+ * point to expire so the kernel may automatically unmount it.
  *
  * The '.zfs', '.zfs/snapshot', and all directories created under
- * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and
- * share the same vfs_t as the head filesystem (what '.zfs' lives under).
+ * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') all share the same
+ * zfs_sb_t as the head filesystem (what '.zfs' lives under).
  *
- * File systems mounted ontop of the GFS nodes '.zfs/snapshot/<snapname>'
- * (ie: snapshots) are ZFS nodes and have their own unique vfs_t.
- * However, vnodes within these mounted on file systems have their v_vfsp
- * fields set to the head filesystem to make NFS happy (see
- * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t
- * so that it cannot be freed until all snapshots have been unmounted.
+ * File systems mounted on top of the '.zfs/snapshot/<snapname>' paths
+ * (ie: snapshots) are complete ZFS filesystems and have their own unique
+ * zfs_sb_t.  However, the fsid reported by these mounts will be the same
+ * as that used by the parent zfs_sb_t to make NFS happy.
  */
 
-#include <fs/fs_subr.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/pathname.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_vfsops.h>
-#include <sys/vfs_opreg.h>
-#include <sys/gfs.h>
+#include <sys/zfs_vnops.h>
 #include <sys/stat.h>
 #include <sys/dmu.h>
 #include <sys/dsl_deleg.h>
 #include <sys/mount.h>
-#include <sys/sunddi.h>
-
+#include <sys/zpl.h>
 #include "zfs_namecheck.h"
 
-typedef struct zfsctl_node {
-	gfs_dir_t	zc_gfs_private;
-	uint64_t	zc_id;
-	timestruc_t	zc_cmtime;	/* ctime and mtime, always the same */
-} zfsctl_node_t;
+/*
+ * Control Directory Tunables (.zfs)
+ */
+int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT;
 
-typedef struct zfsctl_snapdir {
-	zfsctl_node_t	sd_node;
-	kmutex_t	sd_lock;
-	avl_tree_t	sd_snaps;
-} zfsctl_snapdir_t;
+/*
+ * Dedicated task queue for unmounting snapshots.
+ */
+static taskq_t *zfs_expire_taskq;
 
-typedef struct {
-	char		*se_name;
-	vnode_t		*se_root;
-	avl_node_t	se_node;
-} zfs_snapentry_t;
+static zfs_snapentry_t *
+zfsctl_sep_alloc(void)
+{
+	return kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP);
+}
 
-static int
+void
+zfsctl_sep_free(zfs_snapentry_t *sep)
+{
+	kmem_free(sep->se_name, MAXNAMELEN);
+	kmem_free(sep->se_path, PATH_MAX);
+	kmem_free(sep, sizeof (zfs_snapentry_t));
+}
+
+/*
+ * Attempt to expire an automounted snapshot; unmounts are attempted every
+ * 'zfs_expire_snapshot' seconds until they succeed.  The work request is
+ * responsible for rescheduling itself and freeing the zfs_expire_snapshot_t.
+ */
+static void
+zfsctl_expire_snapshot(void *data)
+{
+	zfs_snapentry_t *sep = (zfs_snapentry_t *)data;
+	zfs_sb_t *zsb = ITOZSB(sep->se_inode);
+	int error;
+
+	error = zfsctl_unmount_snapshot(zsb, sep->se_name, MNT_EXPIRE);
+	if (error == EBUSY)
+		sep->se_taskqid = taskq_dispatch_delay(zfs_expire_taskq,
+		    zfsctl_expire_snapshot, sep, TQ_SLEEP,
+		    ddi_get_lbolt() + zfs_expire_snapshot * HZ);
+}
+
+int
 snapentry_compare(const void *a, const void *b)
 {
 	const zfs_snapentry_t *sa = a;
@@ -113,220 +143,193 @@ snapentry_compare(const void *a, const void *b)
 		return (0);
 }
 
-vnodeops_t *zfsctl_ops_root;
-vnodeops_t *zfsctl_ops_snapdir;
-vnodeops_t *zfsctl_ops_snapshot;
-
-static const fs_operation_def_t zfsctl_tops_root[];
-static const fs_operation_def_t zfsctl_tops_snapdir[];
-static const fs_operation_def_t zfsctl_tops_snapshot[];
-
-static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
-static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
-static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *);
-
-static gfs_opsvec_t zfsctl_opsvec[] = {
-	{ ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
-	{ ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
-	{ ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
-	{ NULL }
-};
-
-/*
- * Root directory elements.  We have only a single static entry, 'snapshot'.
- */
-static gfs_dirent_t zfsctl_root_entries[] = {
-	{ "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
-	{ NULL }
-};
-
-/* include . and .. in the calculation */
-#define	NROOT_ENTRIES	((sizeof (zfsctl_root_entries) / \
-    sizeof (gfs_dirent_t)) + 1)
-
-
-/*
- * Initialize the various GFS pieces we'll need to create and manipulate .zfs
- * directories.  This is called from the ZFS init routine, and initializes the
- * vnode ops vectors that we'll be using.
- */
-void
-zfsctl_init(void)
+boolean_t
+zfsctl_is_node(struct inode *ip)
 {
-	VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0);
+	return (ITOZ(ip)->z_is_ctldir);
 }
 
-void
-zfsctl_fini(void)
+boolean_t
+zfsctl_is_snapdir(struct inode *ip)
 {
-	/*
-	 * Remove vfsctl vnode ops
-	 */
-	if (zfsctl_ops_root)
-		vn_freevnodeops(zfsctl_ops_root);
-	if (zfsctl_ops_snapdir)
-		vn_freevnodeops(zfsctl_ops_snapdir);
-	if (zfsctl_ops_snapshot)
-		vn_freevnodeops(zfsctl_ops_snapshot);
-
-	zfsctl_ops_root = NULL;
-	zfsctl_ops_snapdir = NULL;
-	zfsctl_ops_snapshot = NULL;
+	return (zfsctl_is_node(ip) && (ip->i_ino <= ZFSCTL_INO_SNAPDIRS));
 }
 
 /*
- * Return the inode number associated with the 'snapshot' directory.
+ * Allocate a new inode with the passed id and ops.
  */
-/* ARGSUSED */
-static ino64_t
-zfsctl_root_inode_cb(vnode_t *vp, int index)
+static struct inode *
+zfsctl_inode_alloc(zfs_sb_t *zsb, uint64_t id,
+    const struct file_operations *fops, const struct inode_operations *ops)
 {
-	ASSERT(index == 0);
-	return (ZFSCTL_INO_SNAPDIR);
+	struct timespec now = current_fs_time(zsb->z_sb);
+	struct inode *ip;
+	znode_t *zp;
+
+	ip = new_inode(zsb->z_sb);
+	if (ip == NULL)
+		return (NULL);
+
+	zp = ITOZ(ip);
+	ASSERT3P(zp->z_dirlocks, ==, NULL);
+	ASSERT3P(zp->z_acl_cached, ==, NULL);
+	ASSERT3P(zp->z_xattr_cached, ==, NULL);
+	zp->z_id = id;
+	zp->z_unlinked = 0;
+	zp->z_atime_dirty = 0;
+	zp->z_zn_prefetch = 0;
+	zp->z_moved = 0;
+	zp->z_sa_hdl = NULL;
+	zp->z_blksz = 0;
+	zp->z_seq = 0;
+	zp->z_mapcnt = 0;
+	zp->z_gen = 0;
+	zp->z_size = 0;
+	zp->z_atime[0] = 0;
+	zp->z_atime[1] = 0;
+	zp->z_links = 0;
+	zp->z_pflags = 0;
+	zp->z_uid = 0;
+	zp->z_gid = 0;
+	zp->z_mode = 0;
+	zp->z_sync_cnt = 0;
+	zp->z_is_zvol = B_FALSE;
+	zp->z_is_mapped = B_FALSE;
+	zp->z_is_ctldir = B_TRUE;
+	zp->z_is_sa = B_FALSE;
+	zp->z_is_stale = B_FALSE;
+	ip->i_ino = id;
+	ip->i_mode = (S_IFDIR | S_IRUGO | S_IXUGO);
+	ip->i_uid = SUID_TO_KUID(0);
+	ip->i_gid = SGID_TO_KGID(0);
+	ip->i_blkbits = SPA_MINBLOCKSHIFT;
+	ip->i_atime = now;
+	ip->i_mtime = now;
+	ip->i_ctime = now;
+	ip->i_fop = fops;
+	ip->i_op = ops;
+
+	if (insert_inode_locked(ip)) {
+		unlock_new_inode(ip);
+		iput(ip);
+		return (NULL);
+	}
+
+	mutex_enter(&zsb->z_znodes_lock);
+	list_insert_tail(&zsb->z_all_znodes, zp);
+	zsb->z_nr_znodes++;
+	membar_producer();
+	mutex_exit(&zsb->z_znodes_lock);
+
+	unlock_new_inode(ip);
+
+	return (ip);
 }
 
 /*
- * Create the '.zfs' directory.  This directory is cached as part of the VFS
- * structure.  This results in a hold on the vfs_t.  The code in zfs_umount()
- * therefore checks against a vfs_count of 2 instead of 1.  This reference
- * is removed when the ctldir is destroyed in the unmount.
+ * Look up the inode with the given id; it will be allocated if needed.
  */
-void
-zfsctl_create(zfsvfs_t *zfsvfs)
+static struct inode *
+zfsctl_inode_lookup(zfs_sb_t *zsb, uint64_t id,
+    const struct file_operations *fops, const struct inode_operations *ops)
 {
-	vnode_t *vp, *rvp;
-	zfsctl_node_t *zcp;
+	struct inode *ip = NULL;
 
-	ASSERT(zfsvfs->z_ctldir == NULL);
-
-	vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
-	    zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
-	    zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
-	zcp = vp->v_data;
-	zcp->zc_id = ZFSCTL_INO_ROOT;
-
-	VERIFY(VFS_ROOT(zfsvfs->z_vfs, &rvp) == 0);
-	ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime);
-	VN_RELE(rvp);
+	while (ip == NULL) {
+		ip = ilookup(zsb->z_sb, (unsigned long)id);
+		if (ip)
+			break;
 
-	/*
-	 * We're only faking the fact that we have a root of a filesystem for
-	 * the sake of the GFS interfaces.  Undo the flag manipulation it did
-	 * for us.
-	 */
-	vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT);
+		/* May fail due to concurrent zfsctl_inode_alloc() */
+		ip = zfsctl_inode_alloc(zsb, id, fops, ops);
+	}
 
-	zfsvfs->z_ctldir = vp;
+	return (ip);
 }
 
 /*
- * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
- * There might still be more references if we were force unmounted, but only
- * new zfs_inactive() calls can occur and they don't reference .zfs
+ * Free zfsctl inode specific structures, currently there are none.
  */
 void
-zfsctl_destroy(zfsvfs_t *zfsvfs)
+zfsctl_inode_destroy(struct inode *ip)
 {
-	VN_RELE(zfsvfs->z_ctldir);
-	zfsvfs->z_ctldir = NULL;
+	return;
 }
 
 /*
- * Given a root znode, retrieve the associated .zfs directory.
- * Add a hold to the vnode and return it.
+ * An inode is being evicted from the cache.
  */
-vnode_t *
-zfsctl_root(znode_t *zp)
+void
+zfsctl_inode_inactive(struct inode *ip)
 {
-	ASSERT(zfs_has_ctldir(zp));
-	VN_HOLD(zp->z_zfsvfs->z_ctldir);
-	return (zp->z_zfsvfs->z_ctldir);
+	if (zfsctl_is_snapdir(ip))
+		zfsctl_snapdir_inactive(ip);
 }
 
 /*
- * Common open routine.  Disallow any write access.
+ * Create the '.zfs' directory.  This directory is cached as part of the VFS
+ * structure.  This results in a hold on the zfs_sb_t.  The code in zfs_umount()
+ * therefore checks against a vfs_count of 2 instead of 1.  This reference
+ * is removed when the ctldir is destroyed in the unmount.  All other entities
+ * under the '.zfs' directory are created dynamically as needed.
+ *
+ * Because the dynamically created '.zfs' directory entries assume the use
+ * of 64-bit inode numbers this support must be disabled on 32-bit systems.
  */
-/* ARGSUSED */
-static int
-zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr, caller_context_t *ct)
+int
+zfsctl_create(zfs_sb_t *zsb)
 {
-	if (flags & FWRITE)
-		return (EACCES);
+#if defined(CONFIG_64BIT)
+	ASSERT(zsb->z_ctldir == NULL);
 
-	return (0);
-}
+	zsb->z_ctldir = zfsctl_inode_alloc(zsb, ZFSCTL_INO_ROOT,
+	    &zpl_fops_root, &zpl_ops_root);
+	if (zsb->z_ctldir == NULL)
+		return (ENOENT);
 
-/*
- * Common close routine.  Nothing to do here.
- */
-/* ARGSUSED */
-static int
-zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off,
-    cred_t *cr, caller_context_t *ct)
-{
 	return (0);
+#else
+	return (EOPNOTSUPP);
+#endif /* CONFIG_64BIT */
 }
 
 /*
- * Common access routine.  Disallow writes.
+ * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
  */
-/* ARGSUSED */
-static int
-zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr,
-    caller_context_t *ct)
+void
+zfsctl_destroy(zfs_sb_t *zsb)
 {
-	if (mode & VWRITE)
-		return (EACCES);
-
-	return (0);
+	iput(zsb->z_ctldir);
+	zsb->z_ctldir = NULL;
 }
 
 /*
- * Common getattr function.  Fill in basic information.
+ * Given a root znode, retrieve the associated .zfs directory.
+ * Add a hold to the inode and return it.
  */
-static void
-zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
+struct inode *
+zfsctl_root(znode_t *zp)
 {
-	zfsctl_node_t	*zcp = vp->v_data;
-	timestruc_t	now;
-
-	vap->va_uid = 0;
-	vap->va_gid = 0;
-	vap->va_rdev = 0;
-	/*
-	 * We are a purly virtual object, so we have no
-	 * blocksize or allocated blocks.
-	 */
-	vap->va_blksize = 0;
-	vap->va_nblocks = 0;
-	vap->va_seq = 0;
-	vap->va_fsid = vp->v_vfsp->vfs_dev;
-	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
-	    S_IROTH | S_IXOTH;
-	vap->va_type = VDIR;
-	/*
-	 * We live in the now (for atime).
-	 */
-	gethrestime(&now);
-	vap->va_atime = now;
-	vap->va_mtime = vap->va_ctime = zcp->zc_cmtime;
+	ASSERT(zfs_has_ctldir(zp));
+	igrab(ZTOZSB(zp)->z_ctldir);
+	return (ZTOZSB(zp)->z_ctldir);
 }
 
 /*ARGSUSED*/
-static int
-zfsctl_common_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
+int
+zfsctl_fid(struct inode *ip, fid_t *fidp)
 {
-	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
-	zfsctl_node_t	*zcp = vp->v_data;
-	uint64_t	object = zcp->zc_id;
+	znode_t		*zp = ITOZ(ip);
+	zfs_sb_t	*zsb = ITOZSB(ip);
+	uint64_t	object = zp->z_id;
 	zfid_short_t	*zfid;
 	int		i;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER(zsb);
 
 	if (fidp->fid_len < SHORT_FID_LEN) {
 		fidp->fid_len = SHORT_FID_LEN;
-		ZFS_EXIT(zfsvfs);
+		ZFS_EXIT(zsb);
 		return (ENOSPC);
 	}
 
@@ -341,42 +344,54 @@ zfsctl_common_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
 	for (i = 0; i < sizeof (zfid->zf_gen); i++)
 		zfid->zf_gen[i] = 0;
 
-	ZFS_EXIT(zfsvfs);
+	ZFS_EXIT(zsb);
 	return (0);
 }
 
-/*
- * .zfs inode namespace
- *
- * We need to generate unique inode numbers for all files and directories
- * within the .zfs pseudo-filesystem.  We use the following scheme:
- *
- * 	ENTRY			ZFSCTL_INODE
- * 	.zfs			1
- * 	.zfs/snapshot		2
- * 	.zfs/snapshot/<snap>	objectid(snap)
- */
+static int
+zfsctl_snapshot_zname(struct inode *ip, const char *name, int len, char *zname)
+{
+	objset_t *os = ITOZSB(ip)->z_os;
 
-#define	ZFSCTL_INO_SNAP(id)	(id)
+	if (snapshot_namecheck(name, NULL, NULL) != 0)
+		return (EILSEQ);
+
+	dmu_objset_name(os, zname);
+	if ((strlen(zname) + 1 + strlen(name)) >= len)
+		return (ENAMETOOLONG);
+
+	(void) strcat(zname, "@");
+	(void) strcat(zname, name);
+
+	return (0);
+}
 
-/*
- * Get root directory attributes.
- */
-/* ARGSUSED */
 static int
-zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
-    caller_context_t *ct)
+zfsctl_snapshot_zpath(struct path *path, int len, char *zpath)
 {
-	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+	char *path_buffer, *path_ptr;
+	int path_len, error = 0;
+
+	path_buffer = kmem_alloc(len, KM_SLEEP);
 
-	ZFS_ENTER(zfsvfs);
-	vap->va_nodeid = ZFSCTL_INO_ROOT;
-	vap->va_nlink = vap->va_size = NROOT_ENTRIES;
+	path_ptr = d_path(path, path_buffer, len);
+	if (IS_ERR(path_ptr)) {
+		error = -PTR_ERR(path_ptr);
+		goto out;
+	}
 
-	zfsctl_common_getattr(vp, vap);
-	ZFS_EXIT(zfsvfs);
+	path_len = path_buffer + len - 1 - path_ptr;
+	if (path_len > len) {
+		error = EFAULT;
+		goto out;
+	}
 
-	return (0);
+	memcpy(zpath, path_ptr, path_len);
+	zpath[path_len] = '\0';
+out:
+	kmem_free(path_buffer, len);
+
+	return (error);
 }
 
 /*
@@ -384,776 +399,612 @@ zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
  */
 /* ARGSUSED */
 int
-zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
-    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
-    int *direntflags, pathname_t *realpnp)
+zfsctl_root_lookup(struct inode *dip, char *name, struct inode **ipp,
+    int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
 {
-	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
-	int err;
-
-	/*
-	 * No extended attributes allowed under .zfs
-	 */
-	if (flags & LOOKUP_XATTR)
-		return (EINVAL);
-
-	ZFS_ENTER(zfsvfs);
-
-	if (strcmp(nm, "..") == 0) {
-		err = VFS_ROOT(dvp->v_vfsp, vpp);
+	zfs_sb_t *zsb = ITOZSB(dip);
+	int error = 0;
+
+	ZFS_ENTER(zsb);
+
+	if (strcmp(name, "..") == 0) {
+		*ipp = dip->i_sb->s_root->d_inode;
+	} else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) {
+		*ipp = zfsctl_inode_lookup(zsb, ZFSCTL_INO_SNAPDIR,
+		    &zpl_fops_snapdir, &zpl_ops_snapdir);
+	} else if (strcmp(name, ZFS_SHAREDIR_NAME) == 0) {
+		*ipp = zfsctl_inode_lookup(zsb, ZFSCTL_INO_SHARES,
+		    &zpl_fops_shares, &zpl_ops_shares);
 	} else {
-		err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir,
-		    cr, ct, direntflags, realpnp);
+		*ipp = NULL;
 	}
 
-	ZFS_EXIT(zfsvfs);
-
-	return (err);
-}
+	if (*ipp == NULL)
+		error = ENOENT;
 
-static const fs_operation_def_t zfsctl_tops_root[] = {
-	{ VOPNAME_OPEN,		{ .vop_open = zfsctl_common_open }	},
-	{ VOPNAME_CLOSE,	{ .vop_close = zfsctl_common_close }	},
-	{ VOPNAME_IOCTL,	{ .error = fs_inval }			},
-	{ VOPNAME_GETATTR,	{ .vop_getattr = zfsctl_root_getattr }	},
-	{ VOPNAME_ACCESS,	{ .vop_access = zfsctl_common_access }	},
-	{ VOPNAME_READDIR,	{ .vop_readdir = gfs_vop_readdir } 	},
-	{ VOPNAME_LOOKUP,	{ .vop_lookup = zfsctl_root_lookup }	},
-	{ VOPNAME_SEEK,		{ .vop_seek = fs_seek }			},
-	{ VOPNAME_INACTIVE,	{ .vop_inactive = gfs_vop_inactive }	},
-	{ VOPNAME_FID,		{ .vop_fid = zfsctl_common_fid	}	},
-	{ NULL }
-};
+	ZFS_EXIT(zsb);
 
-static int
-zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
-{
-	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
-
-	if (snapshot_namecheck(name, NULL, NULL) != 0)
-		return (EILSEQ);
-	dmu_objset_name(os, zname);
-	if (strlen(zname) + 1 + strlen(name) >= len)
-		return (ENAMETOOLONG);
-	(void) strcat(zname, "@");
-	(void) strcat(zname, name);
-	return (0);
+	return (error);
 }
 
-static int
-zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr)
+/*
+ * Lookup entry point for the 'snapshot' directory.  Try to open the
+ * snapshot if it exists, creating the pseudo filesystem inode as necessary.
+ * Perform a mount of the associated dataset on top of the inode.
+ */
+/* ARGSUSED */
+int
+zfsctl_snapdir_lookup(struct inode *dip, char *name, struct inode **ipp,
+    int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
 {
-	vnode_t *svp = sep->se_root;
+	zfs_sb_t *zsb = ITOZSB(dip);
+	uint64_t id;
 	int error;
 
-	ASSERT(vn_ismntpt(svp));
-
-	/* this will be dropped by dounmount() */
-	if ((error = vn_vfswlock(svp)) != 0)
-		return (error);
+	ZFS_ENTER(zsb);
 
-	VN_HOLD(svp);
-	error = dounmount(vn_mountedvfs(svp), fflags, cr);
+	error = dmu_snapshot_lookup(zsb->z_os, name, &id);
 	if (error) {
-		VN_RELE(svp);
+		ZFS_EXIT(zsb);
 		return (error);
 	}
-	VFS_RELE(svp->v_vfsp);
-	/*
-	 * We can't use VN_RELE(), as that will try to invoke
-	 * zfsctl_snapdir_inactive(), which would cause us to destroy
-	 * the sd_lock mutex held by our caller.
-	 */
-	ASSERT(svp->v_count == 1);
-	gfs_vop_inactive(svp, cr, NULL);
 
-	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
-	kmem_free(sep, sizeof (zfs_snapentry_t));
+	*ipp = zfsctl_inode_lookup(zsb, ZFSCTL_INO_SNAPDIRS - id,
+	    &simple_dir_operations, &simple_dir_inode_operations);
+	if (*ipp) {
+#ifdef HAVE_AUTOMOUNT
+		(*ipp)->i_flags |= S_AUTOMOUNT;
+#endif /* HAVE_AUTOMOUNT */
+	} else {
+		error = ENOENT;
+	}
 
-	return (0);
+	ZFS_EXIT(zsb);
+
+	return (error);
 }
 
 static void
-zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
+zfsctl_rename_snap(zfs_sb_t *zsb, zfs_snapentry_t *sep, const char *name)
 {
 	avl_index_t where;
-	vfs_t *vfsp;
-	refstr_t *pathref;
-	char newpath[MAXNAMELEN];
-	char *tail;
 
-	ASSERT(MUTEX_HELD(&sdp->sd_lock));
+	ASSERT(MUTEX_HELD(&zsb->z_ctldir_lock));
 	ASSERT(sep != NULL);
 
-	vfsp = vn_mountedvfs(sep->se_root);
-	ASSERT(vfsp != NULL);
-
-	vfs_lock_wait(vfsp);
-
 	/*
 	 * Change the name in the AVL tree.
 	 */
-	avl_remove(&sdp->sd_snaps, sep);
-	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
-	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
-	(void) strcpy(sep->se_name, nm);
-	VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
-	avl_insert(&sdp->sd_snaps, sep, where);
-
-	/*
-	 * Change the current mountpoint info:
-	 * 	- update the tail of the mntpoint path
-	 *	- update the tail of the resource path
-	 */
-	pathref = vfs_getmntpoint(vfsp);
-	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
-	VERIFY((tail = strrchr(newpath, '/')) != NULL);
-	*(tail+1) = '\0';
-	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
-	(void) strcat(newpath, nm);
-	refstr_rele(pathref);
-	vfs_setmntpoint(vfsp, newpath);
-
-	pathref = vfs_getresource(vfsp);
-	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
-	VERIFY((tail = strrchr(newpath, '@')) != NULL);
-	*(tail+1) = '\0';
-	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
-	(void) strcat(newpath, nm);
-	refstr_rele(pathref);
-	vfs_setresource(vfsp, newpath);
-
-	vfs_unlock(vfsp);
+	avl_remove(&zsb->z_ctldir_snaps, sep);
+	(void) strcpy(sep->se_name, name);
+	VERIFY(avl_find(&zsb->z_ctldir_snaps, sep, &where) == NULL);
+	avl_insert(&zsb->z_ctldir_snaps, sep, where);
 }
 
+/*
+ * Renaming a directory under '.zfs/snapshot' will automatically trigger
+ * a rename of the snapshot to the new given name.  The rename is confined
+ * to the '.zfs/snapshot' directory; snapshots cannot be moved elsewhere.
+ */
 /*ARGSUSED*/
-static int
-zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
-    cred_t *cr, caller_context_t *ct, int flags)
+int
+zfsctl_snapdir_rename(struct inode *sdip, char *sname,
+    struct inode *tdip, char *tname, cred_t *cr, int flags)
 {
-	zfsctl_snapdir_t *sdp = sdvp->v_data;
+	zfs_sb_t *zsb = ITOZSB(sdip);
 	zfs_snapentry_t search, *sep;
-	zfsvfs_t *zfsvfs;
 	avl_index_t where;
-	char from[MAXNAMELEN], to[MAXNAMELEN];
-	char real[MAXNAMELEN];
-	int err;
+	char *to, *from, *real;
+	int error;
+
+	ZFS_ENTER(zsb);
 
-	zfsvfs = sdvp->v_vfsp->vfs_data;
-	ZFS_ENTER(zfsvfs);
+	to = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+	from = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+	real = kmem_alloc(MAXNAMELEN, KM_SLEEP);
 
-	if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
-		err = dmu_snapshot_realname(zfsvfs->z_os, snm, real,
+	if (zsb->z_case == ZFS_CASE_INSENSITIVE) {
+		error = dmu_snapshot_realname(zsb->z_os, sname, real,
 		    MAXNAMELEN, NULL);
-		if (err == 0) {
-			snm = real;
-		} else if (err != ENOTSUP) {
-			ZFS_EXIT(zfsvfs);
-			return (err);
+		if (error == 0) {
+			sname = real;
+		} else if (error != ENOTSUP) {
+			goto out;
 		}
 	}
 
-	ZFS_EXIT(zfsvfs);
-
-	err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
-	if (!err)
-		err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
-	if (!err)
-		err = zfs_secpolicy_rename_perms(from, to, cr);
-	if (err)
-		return (err);
+	error = zfsctl_snapshot_zname(sdip, sname, MAXNAMELEN, from);
+	if (!error)
+		error = zfsctl_snapshot_zname(tdip, tname, MAXNAMELEN, to);
+	if (!error)
+		error = zfs_secpolicy_rename_perms(from, to, cr);
+	if (error)
+		goto out;
 
 	/*
 	 * Cannot move snapshots out of the snapdir.
 	 */
-	if (sdvp != tdvp)
-		return (EINVAL);
+	if (sdip != tdip) {
+		error = EINVAL;
+		goto out;
+	}
+
+	/*
+	 * No-op when names are identical.
+	 */
+	if (strcmp(sname, tname) == 0) {
+		error = 0;
+		goto out;
+	}
 
-	if (strcmp(snm, tnm) == 0)
-		return (0);
+	mutex_enter(&zsb->z_ctldir_lock);
 
-	mutex_enter(&sdp->sd_lock);
+	error = dmu_objset_rename(from, to, B_FALSE);
+	if (error)
+		goto out_unlock;
 
-	search.se_name = (char *)snm;
-	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
-		mutex_exit(&sdp->sd_lock);
-		return (ENOENT);
-	}
+	search.se_name = (char *)sname;
+	sep = avl_find(&zsb->z_ctldir_snaps, &search, &where);
+	if (sep)
+		zfsctl_rename_snap(zsb, sep, tname);
 
-	err = dmu_objset_rename(from, to, B_FALSE);
-	if (err == 0)
-		zfsctl_rename_snap(sdp, sep, tnm);
+out_unlock:
+	mutex_exit(&zsb->z_ctldir_lock);
+out:
+	kmem_free(from, MAXNAMELEN);
+	kmem_free(to, MAXNAMELEN);
+	kmem_free(real, MAXNAMELEN);
 
-	mutex_exit(&sdp->sd_lock);
+	ZFS_EXIT(zsb);
 
-	return (err);
+	return (error);
 }
 
+/*
+ * Removing a directory under '.zfs/snapshot' will automatically trigger
+ * the removal of the snapshot with the given name.
+ */
 /* ARGSUSED */
-static int
-zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
-    caller_context_t *ct, int flags)
+int
+zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags)
 {
-	zfsctl_snapdir_t *sdp = dvp->v_data;
-	zfs_snapentry_t *sep;
-	zfs_snapentry_t search;
-	zfsvfs_t *zfsvfs;
-	char snapname[MAXNAMELEN];
-	char real[MAXNAMELEN];
-	int err;
+	zfs_sb_t *zsb = ITOZSB(dip);
+	char *snapname, *real;
+	int error;
 
-	zfsvfs = dvp->v_vfsp->vfs_data;
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER(zsb);
 
-	if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+	snapname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+	real = kmem_alloc(MAXNAMELEN, KM_SLEEP);
 
-		err = dmu_snapshot_realname(zfsvfs->z_os, name, real,
+	if (zsb->z_case == ZFS_CASE_INSENSITIVE) {
+		error = dmu_snapshot_realname(zsb->z_os, name, real,
 		    MAXNAMELEN, NULL);
-		if (err == 0) {
+		if (error == 0) {
 			name = real;
-		} else if (err != ENOTSUP) {
-			ZFS_EXIT(zfsvfs);
-			return (err);
+		} else if (error != ENOTSUP) {
+			goto out;
 		}
 	}
 
-	ZFS_EXIT(zfsvfs);
+	error = zfsctl_snapshot_zname(dip, name, MAXNAMELEN, snapname);
+	if (!error)
+		error = zfs_secpolicy_destroy_perms(snapname, cr);
+	if (error)
+		goto out;
 
-	err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
-	if (!err)
-		err = zfs_secpolicy_destroy_perms(snapname, cr);
-	if (err)
-		return (err);
+	error = zfsctl_unmount_snapshot(zsb, name, MNT_FORCE);
+	if ((error == 0) || (error == ENOENT))
+		error = dmu_objset_destroy(snapname, B_FALSE);
+out:
+	kmem_free(snapname, MAXNAMELEN);
+	kmem_free(real, MAXNAMELEN);
 
-	mutex_enter(&sdp->sd_lock);
-
-	search.se_name = name;
-	sep = avl_find(&sdp->sd_snaps, &search, NULL);
-	if (sep) {
-		avl_remove(&sdp->sd_snaps, sep);
-		err = zfsctl_unmount_snap(sep, MS_FORCE, cr);
-		if (err)
-			avl_add(&sdp->sd_snaps, sep);
-		else
-			err = dmu_objset_destroy(snapname);
-	} else {
-		err = ENOENT;
-	}
+	ZFS_EXIT(zsb);
 
-	mutex_exit(&sdp->sd_lock);
-
-	return (err);
+	return (error);
 }
 
 /*
- * This creates a snapshot under '.zfs/snapshot'.
+ * Creating a directory under '.zfs/snapshot' will automatically trigger
+ * the creation of a new snapshot with the given name.
  */
 /* ARGSUSED */
-static int
-zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t  **vpp,
-    cred_t *cr, caller_context_t *cc, int flags, vsecattr_t *vsecp)
+int
+zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap,
+	struct inode **ipp, cred_t *cr, int flags)
 {
-	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
-	char name[MAXNAMELEN];
-	int err;
-	static enum symfollow follow = NO_FOLLOW;
-	static enum uio_seg seg = UIO_SYSSPACE;
+	zfs_sb_t *zsb = ITOZSB(dip);
+	char *dsname;
+	int error;
 
-	if (snapshot_namecheck(dirname, NULL, NULL) != 0)
-		return (EILSEQ);
+	dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
 
-	dmu_objset_name(zfsvfs->z_os, name);
+	if (snapshot_namecheck(dirname, NULL, NULL) != 0) {
+		error = EILSEQ;
+		goto out;
+	}
+
+	dmu_objset_name(zsb->z_os, dsname);
 
-	*vpp = NULL;
+	error = zfs_secpolicy_snapshot_perms(dsname, cr);
+	if (error)
+		goto out;
 
-	err = zfs_secpolicy_snapshot_perms(name, cr);
-	if (err)
-		return (err);
+	if (error == 0) {
+		error = dmu_objset_snapshot(dsname, dirname,
+		    NULL, NULL, B_FALSE, B_FALSE, -1);
+		if (error)
+			goto out;
 
-	if (err == 0) {
-		err = dmu_objset_snapshot(name, dirname, B_FALSE);
-		if (err)
-			return (err);
-		err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp);
+		error = zfsctl_snapdir_lookup(dip, dirname, ipp,
+		    0, cr, NULL, NULL);
 	}
+out:
+	kmem_free(dsname, MAXNAMELEN);
 
-	return (err);
+	return (error);
 }
 
 /*
- * Lookup entry point for the 'snapshot' directory.  Try to open the
- * snapshot if it exist, creating the pseudo filesystem vnode as necessary.
- * Perform a mount of the associated dataset on top of the vnode.
+ * When a .zfs/snapshot/<snapshot> inode is evicted it must be removed
+ * from the snapshot list.  This will normally happen as part of the auto
+ * unmount, however in the case of a manual snapshot unmount this will be
+ * the only notification we receive.
  */
-/* ARGSUSED */
-static int
-zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
-    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
-    int *direntflags, pathname_t *realpnp)
+void
+zfsctl_snapdir_inactive(struct inode *ip)
 {
-	zfsctl_snapdir_t *sdp = dvp->v_data;
-	objset_t *snap;
-	char snapname[MAXNAMELEN];
-	char real[MAXNAMELEN];
-	char *mountpoint;
-	zfs_snapentry_t *sep, search;
-	struct mounta margs;
-	vfs_t *vfsp;
-	size_t mountpoint_len;
-	avl_index_t where;
-	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
-	int err;
-
-	/*
-	 * No extended attributes allowed under .zfs
-	 */
-	if (flags & LOOKUP_XATTR)
-		return (EINVAL);
+	zfs_sb_t *zsb = ITOZSB(ip);
+	zfs_snapentry_t *sep, *next;
 
-	ASSERT(dvp->v_type == VDIR);
+	mutex_enter(&zsb->z_ctldir_lock);
 
-	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
-		return (0);
+	sep = avl_first(&zsb->z_ctldir_snaps);
+	while (sep != NULL) {
+		next = AVL_NEXT(&zsb->z_ctldir_snaps, sep);
 
-	/*
-	 * If we get a recursive call, that means we got called
-	 * from the domount() code while it was trying to look up the
-	 * spec (which looks like a local path for zfs).  We need to
-	 * add some flag to domount() to tell it not to do this lookup.
-	 */
-	if (MUTEX_HELD(&sdp->sd_lock))
-		return (ENOENT);
+		if (sep->se_inode == ip) {
+			avl_remove(&zsb->z_ctldir_snaps, sep);
+			taskq_cancel_id(zfs_expire_taskq, sep->se_taskqid);
+			zfsctl_sep_free(sep);
+			break;
+		}
+		sep = next;
+	}
 
-	ZFS_ENTER(zfsvfs);
+	mutex_exit(&zsb->z_ctldir_lock);
+}
 
-	if (flags & FIGNORECASE) {
-		boolean_t conflict = B_FALSE;
+/*
+ * Attempt to unmount a snapshot by making a call to user space.
+ * There is no assurance that this can or will succeed; it is just a
+ * best effort.  In the case where it does fail, perhaps because
+ * it's in use, the unmount will fail harmlessly.
+ */
+#define SET_UNMOUNT_CMD \
+	"exec 0</dev/null " \
+	"     1>/dev/null " \
+	"     2>/dev/null; " \
+	"umount -t zfs -n %s'%s'"
 
-		err = dmu_snapshot_realname(zfsvfs->z_os, nm, real,
-		    MAXNAMELEN, &conflict);
-		if (err == 0) {
-			nm = real;
-		} else if (err != ENOTSUP) {
-			ZFS_EXIT(zfsvfs);
-			return (err);
-		}
-		if (realpnp)
-			(void) strlcpy(realpnp->pn_buf, nm,
-			    realpnp->pn_bufsize);
-		if (conflict && direntflags)
-			*direntflags = ED_CASE_CONFLICT;
-	}
+static int
+__zfsctl_unmount_snapshot(zfs_snapentry_t *sep, int flags)
+{
+	char *argv[] = { "/bin/sh", "-c", NULL, NULL };
+	char *envp[] = { NULL };
+	int error;
 
-	mutex_enter(&sdp->sd_lock);
-	search.se_name = (char *)nm;
-	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
-		*vpp = sep->se_root;
-		VN_HOLD(*vpp);
-		err = traverse(vpp);
-		if (err) {
-			VN_RELE(*vpp);
-			*vpp = NULL;
-		} else if (*vpp == sep->se_root) {
-			/*
-			 * The snapshot was unmounted behind our backs,
-			 * try to remount it.
-			 */
-			goto domount;
-		} else {
-			/*
-			 * VROOT was set during the traverse call.  We need
-			 * to clear it since we're pretending to be part
-			 * of our parent's vfs.
-			 */
-			(*vpp)->v_flag &= ~VROOT;
-		}
-		mutex_exit(&sdp->sd_lock);
-		ZFS_EXIT(zfsvfs);
-		return (err);
-	}
+	argv[2] = kmem_asprintf(SET_UNMOUNT_CMD,
+	    flags & MNT_FORCE ? "-f " : "", sep->se_path);
+	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+	strfree(argv[2]);
 
 	/*
-	 * The requested snapshot is not currently mounted, look it up.
+	 * The umount system utility will return 256 on error.  We must
+	 * assume this error is because the file system is busy so it is
+	 * converted to the more sensible EBUSY.
 	 */
-	err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
-	if (err) {
-		mutex_exit(&sdp->sd_lock);
-		ZFS_EXIT(zfsvfs);
-		/*
-		 * handle "ls *" or "?" in a graceful manner,
-		 * forcing EILSEQ to ENOENT.
-		 * Since shell ultimately passes "*" or "?" as name to lookup
-		 */
-		return (err == EILSEQ ? ENOENT : err);
-	}
-	if (dmu_objset_open(snapname, DMU_OST_ZFS,
-	    DS_MODE_USER | DS_MODE_READONLY, &snap) != 0) {
-		mutex_exit(&sdp->sd_lock);
-		ZFS_EXIT(zfsvfs);
-		return (ENOENT);
-	}
-
-	sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
-	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
-	(void) strcpy(sep->se_name, nm);
-	*vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
-	avl_insert(&sdp->sd_snaps, sep, where);
-
-	dmu_objset_close(snap);
-domount:
-	mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) +
-	    strlen("/.zfs/snapshot/") + strlen(nm) + 1;
-	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
-	(void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
-	    refstr_value(dvp->v_vfsp->vfs_mntpt), nm);
-
-	margs.spec = snapname;
-	margs.dir = mountpoint;
-	margs.flags = MS_SYSSPACE | MS_NOMNTTAB;
-	margs.fstype = "zfs";
-	margs.dataptr = NULL;
-	margs.datalen = 0;
-	margs.optptr = NULL;
-	margs.optlen = 0;
-
-	err = domount("zfs", &margs, *vpp, kcred, &vfsp);
-	kmem_free(mountpoint, mountpoint_len);
-
-	if (err == 0) {
-		/*
-		 * Return the mounted root rather than the covered mount point.
-		 * Takes the GFS vnode at .zfs/snapshot/<snapname> and returns
-		 * the ZFS vnode mounted on top of the GFS node.  This ZFS
-		 * vnode is the root the newly created vfsp.
-		 */
-		VFS_RELE(vfsp);
-		err = traverse(vpp);
-	}
-
-	if (err == 0) {
-		/*
-		 * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
-		 *
-		 * This is where we lie about our v_vfsp in order to
-		 * make .zfs/snapshot/<snapname> accessible over NFS
-		 * without requiring manual mounts of <snapname>.
-		 */
-		ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
-		VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
-		(*vpp)->v_vfsp = zfsvfs->z_vfs;
-		(*vpp)->v_flag &= ~VROOT;
-	}
-	mutex_exit(&sdp->sd_lock);
-	ZFS_EXIT(zfsvfs);
+	if (error)
+		error = EBUSY;
 
 	/*
-	 * If we had an error, drop our hold on the vnode and
-	 * zfsctl_snapshot_inactive() will clean up.
+	 * This was the result of a manual unmount, cancel the delayed work
+	 * to prevent zfsctl_expire_snapshot() from attempting an unmount.
 	 */
-	if (err) {
-		VN_RELE(*vpp);
-		*vpp = NULL;
-	}
-	return (err);
+	if ((error == 0) && !(flags & MNT_EXPIRE))
+		taskq_cancel_id(zfs_expire_taskq, sep->se_taskqid);
+
+
+	return (error);
 }
 
-/* ARGSUSED */
-static int
-zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp,
-    offset_t *offp, offset_t *nextp, void *data, int flags)
+int
+zfsctl_unmount_snapshot(zfs_sb_t *zsb, char *name, int flags)
 {
-	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
-	char snapname[MAXNAMELEN];
-	uint64_t id, cookie;
-	boolean_t case_conflict;
-	int error;
+	zfs_snapentry_t search;
+	zfs_snapentry_t *sep;
+	int error = 0;
 
-	ZFS_ENTER(zfsvfs);
+	mutex_enter(&zsb->z_ctldir_lock);
 
-	cookie = *offp;
-	error = dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
-	    &cookie, &case_conflict);
-	if (error) {
-		ZFS_EXIT(zfsvfs);
-		if (error == ENOENT) {
-			*eofp = 1;
-			return (0);
-		}
-		return (error);
-	}
+	search.se_name = name;
+	sep = avl_find(&zsb->z_ctldir_snaps, &search, NULL);
+	if (sep) {
+		avl_remove(&zsb->z_ctldir_snaps, sep);
+		mutex_exit(&zsb->z_ctldir_lock);
 
-	if (flags & V_RDDIR_ENTFLAGS) {
-		edirent_t *eodp = dp;
+		error = __zfsctl_unmount_snapshot(sep, flags);
 
-		(void) strcpy(eodp->ed_name, snapname);
-		eodp->ed_ino = ZFSCTL_INO_SNAP(id);
-		eodp->ed_eflags = case_conflict ? ED_CASE_CONFLICT : 0;
+		mutex_enter(&zsb->z_ctldir_lock);
+		if (error == EBUSY)
+			avl_add(&zsb->z_ctldir_snaps, sep);
+		else
+			zfsctl_sep_free(sep);
 	} else {
-		struct dirent64 *odp = dp;
-
-		(void) strcpy(odp->d_name, snapname);
-		odp->d_ino = ZFSCTL_INO_SNAP(id);
+		error = ENOENT;
 	}
-	*nextp = cookie;
 
-	ZFS_EXIT(zfsvfs);
+	mutex_exit(&zsb->z_ctldir_lock);
+	ASSERT3S(error, >=, 0);
 
-	return (0);
+	return (error);
 }
 
 /*
- * pvp is the '.zfs' directory (zfsctl_node_t).
- * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t).
- *
- * This function is the callback to create a GFS vnode for '.zfs/snapshot'
- * when a lookup is performed on .zfs for "snapshot".
+ * Traverse all mounted snapshots and attempt to unmount them.  This
+ * is best effort; on failure EEXIST is returned and count will be set
+ * to the number of snapshots which could not be unmounted.
  */
-vnode_t *
-zfsctl_mknode_snapdir(vnode_t *pvp)
+int
+zfsctl_unmount_snapshots(zfs_sb_t *zsb, int flags, int *count)
 {
-	vnode_t *vp;
-	zfsctl_snapdir_t *sdp;
-
-	vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp,
-	    zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
-	    zfsctl_snapdir_readdir_cb, NULL);
-	sdp = vp->v_data;
-	sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
-	sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
-	mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
-	avl_create(&sdp->sd_snaps, snapentry_compare,
-	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
-	return (vp);
-}
+	zfs_snapentry_t *sep, *next;
+	int error = 0;
 
-/* ARGSUSED */
-static int
-zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
-    caller_context_t *ct)
-{
-	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
-	zfsctl_snapdir_t *sdp = vp->v_data;
+	*count = 0;
 
-	ZFS_ENTER(zfsvfs);
-	zfsctl_common_getattr(vp, vap);
-	vap->va_nodeid = gfs_file_inode(vp);
-	vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
-	ZFS_EXIT(zfsvfs);
+	ASSERT(zsb->z_ctldir != NULL);
+	mutex_enter(&zsb->z_ctldir_lock);
 
-	return (0);
-}
+	sep = avl_first(&zsb->z_ctldir_snaps);
+	while (sep != NULL) {
+		next = AVL_NEXT(&zsb->z_ctldir_snaps, sep);
+		avl_remove(&zsb->z_ctldir_snaps, sep);
+		mutex_exit(&zsb->z_ctldir_lock);
 
-/* ARGSUSED */
-static void
-zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
-{
-	zfsctl_snapdir_t *sdp = vp->v_data;
-	void *private;
-
-	private = gfs_dir_inactive(vp);
-	if (private != NULL) {
-		ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
-		mutex_destroy(&sdp->sd_lock);
-		avl_destroy(&sdp->sd_snaps);
-		kmem_free(private, sizeof (zfsctl_snapdir_t));
-	}
-}
+		error = __zfsctl_unmount_snapshot(sep, flags);
 
-static const fs_operation_def_t zfsctl_tops_snapdir[] = {
-	{ VOPNAME_OPEN,		{ .vop_open = zfsctl_common_open }	},
-	{ VOPNAME_CLOSE,	{ .vop_close = zfsctl_common_close }	},
-	{ VOPNAME_IOCTL,	{ .error = fs_inval }			},
-	{ VOPNAME_GETATTR,	{ .vop_getattr = zfsctl_snapdir_getattr } },
-	{ VOPNAME_ACCESS,	{ .vop_access = zfsctl_common_access }	},
-	{ VOPNAME_RENAME,	{ .vop_rename = zfsctl_snapdir_rename }	},
-	{ VOPNAME_RMDIR,	{ .vop_rmdir = zfsctl_snapdir_remove }	},
-	{ VOPNAME_MKDIR,	{ .vop_mkdir = zfsctl_snapdir_mkdir }	},
-	{ VOPNAME_READDIR,	{ .vop_readdir = gfs_vop_readdir }	},
-	{ VOPNAME_LOOKUP,	{ .vop_lookup = zfsctl_snapdir_lookup }	},
-	{ VOPNAME_SEEK,		{ .vop_seek = fs_seek }			},
-	{ VOPNAME_INACTIVE,	{ .vop_inactive = zfsctl_snapdir_inactive } },
-	{ VOPNAME_FID,		{ .vop_fid = zfsctl_common_fid }	},
-	{ NULL }
-};
+		mutex_enter(&zsb->z_ctldir_lock);
+		if (error == EBUSY) {
+			avl_add(&zsb->z_ctldir_snaps, sep);
+			(*count)++;
+		} else {
+			zfsctl_sep_free(sep);
+		}
 
-/*
- * pvp is the GFS vnode '.zfs/snapshot'.
- *
- * This creates a GFS node under '.zfs/snapshot' representing each
- * snapshot.  This newly created GFS node is what we mount snapshot
- * vfs_t's ontop of.
- */
-static vnode_t *
-zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
-{
-	vnode_t *vp;
-	zfsctl_node_t *zcp;
+		sep = next;
+	}
 
-	vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
-	    zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
-	zcp = vp->v_data;
-	zcp->zc_id = objset;
-	VFS_HOLD(vp->v_vfsp);
+	mutex_exit(&zsb->z_ctldir_lock);
 
-	return (vp);
+	return ((*count > 0) ? EEXIST : 0);
 }
 
-static void
-zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+#define SET_MOUNT_CMD \
+	"exec 0</dev/null " \
+	"     1>/dev/null " \
+	"     2>/dev/null; " \
+	"mount -t zfs -n '%s' '%s'"
+
+int
+zfsctl_mount_snapshot(struct path *path, int flags)
 {
-	zfsctl_snapdir_t *sdp;
-	zfs_snapentry_t *sep, *next;
-	vnode_t *dvp;
+	struct dentry *dentry = path->dentry;
+	struct inode *ip = dentry->d_inode;
+	zfs_sb_t *zsb = ITOZSB(ip);
+	char *full_name, *full_path;
+	zfs_snapentry_t *sep;
+	zfs_snapentry_t search;
+	char *argv[] = { "/bin/sh", "-c", NULL, NULL };
+	char *envp[] = { NULL };
+	int error;
 
-	VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr, 0, NULL, NULL) == 0);
-	sdp = dvp->v_data;
+	ZFS_ENTER(zsb);
 
-	mutex_enter(&sdp->sd_lock);
+	full_name = kmem_zalloc(MAXNAMELEN, KM_SLEEP);
+	full_path = kmem_zalloc(PATH_MAX, KM_SLEEP);
 
-	if (vp->v_count > 1) {
-		mutex_exit(&sdp->sd_lock);
-		return;
-	}
-	ASSERT(!vn_ismntpt(vp));
+	error = zfsctl_snapshot_zname(ip, dname(dentry), MAXNAMELEN, full_name);
+	if (error)
+		goto error;
 
-	sep = avl_first(&sdp->sd_snaps);
-	while (sep != NULL) {
-		next = AVL_NEXT(&sdp->sd_snaps, sep);
+	error = zfsctl_snapshot_zpath(path, PATH_MAX, full_path);
+	if (error)
+		goto error;
 
-		if (sep->se_root == vp) {
-			avl_remove(&sdp->sd_snaps, sep);
-			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
-			kmem_free(sep, sizeof (zfs_snapentry_t));
-			break;
-		}
-		sep = next;
+	/*
+	 * Attempt to mount the snapshot from user space.  Normally this
+	 * would be done using the vfs_kern_mount() function, however that
+	 * function is marked GPL-only and cannot be used.  On error we are
+	 * careful to log the real error to the console and return EISDIR
+	 * to safely abort the automount.  This should be very rare.
+	 */
+	argv[2] = kmem_asprintf(SET_MOUNT_CMD, full_name, full_path);
+	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+	strfree(argv[2]);
+	if (error) {
+		printk("ZFS: Unable to automount %s at %s: %d\n",
+		    full_name, full_path, error);
+		error = EISDIR;
+		goto error;
 	}
-	ASSERT(sep != NULL);
 
-	mutex_exit(&sdp->sd_lock);
-	VN_RELE(dvp);
-	VFS_RELE(vp->v_vfsp);
+	mutex_enter(&zsb->z_ctldir_lock);
 
 	/*
-	 * Dispose of the vnode for the snapshot mount point.
-	 * This is safe to do because once this entry has been removed
-	 * from the AVL tree, it can't be found again, so cannot become
-	 * "active".  If we lookup the same name again we will end up
-	 * creating a new vnode.
+	 * Ensure a previous entry does not exist; if it does, safely remove
+	 * it and cancel the outstanding expiration.  This can occur when a
+	 * snapshot is manually unmounted and then an automount is triggered.
 	 */
-	gfs_vop_inactive(vp, cr, ct);
+	search.se_name = full_name;
+	sep = avl_find(&zsb->z_ctldir_snaps, &search, NULL);
+	if (sep) {
+		avl_remove(&zsb->z_ctldir_snaps, sep);
+		taskq_cancel_id(zfs_expire_taskq, sep->se_taskqid);
+		zfsctl_sep_free(sep);
+	}
+
+	sep = zfsctl_sep_alloc();
+	sep->se_name = full_name;
+	sep->se_path = full_path;
+	sep->se_inode = ip;
+	avl_add(&zsb->z_ctldir_snaps, sep);
+
+	sep->se_taskqid = taskq_dispatch_delay(zfs_expire_taskq,
+	    zfsctl_expire_snapshot, sep, TQ_SLEEP,
+	    ddi_get_lbolt() + zfs_expire_snapshot * HZ);
+
+	mutex_exit(&zsb->z_ctldir_lock);
+error:
+	if (error) {
+		kmem_free(full_name, MAXNAMELEN);
+		kmem_free(full_path, PATH_MAX);
+	}
+
+	ZFS_EXIT(zsb);
+
+	return (error);
 }
 
+/*
+ * Check if this super block has a matching objset id.
+ */
+static int
+zfsctl_test_super(struct super_block *sb, void *objsetidp)
+{
+	zfs_sb_t *zsb = sb->s_fs_info;
+	uint64_t objsetid = *(uint64_t *)objsetidp;
+
+	return (dmu_objset_id(zsb->z_os) == objsetid);
+}
 
 /*
- * These VP's should never see the light of day.  They should always
- * be covered.
+ * Prevent a new super block from being allocated if an existing one
+ * could not be located.  We only want to perform a lookup operation.
  */
-static const fs_operation_def_t zfsctl_tops_snapshot[] = {
-	VOPNAME_INACTIVE, { .vop_inactive =  zfsctl_snapshot_inactive },
-	NULL, NULL
-};
+static int
+zfsctl_set_super(struct super_block *sb, void *objsetidp)
+{
+	return (-EEXIST);
+}
 
 int
-zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
+zfsctl_lookup_objset(struct super_block *sb, uint64_t objsetid, zfs_sb_t **zsbp)
 {
-	zfsvfs_t *zfsvfs = vfsp->vfs_data;
-	vnode_t *dvp, *vp;
-	zfsctl_snapdir_t *sdp;
-	zfsctl_node_t *zcp;
+	zfs_sb_t *zsb = sb->s_fs_info;
+	struct super_block *sbp;
 	zfs_snapentry_t *sep;
+	uint64_t id;
 	int error;
 
-	ASSERT(zfsvfs->z_ctldir != NULL);
-	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
-	    NULL, 0, NULL, kcred, NULL, NULL, NULL);
-	if (error != 0)
-		return (error);
-	sdp = dvp->v_data;
+	ASSERT(zsb->z_ctldir != NULL);
+
+	mutex_enter(&zsb->z_ctldir_lock);
 
-	mutex_enter(&sdp->sd_lock);
-	sep = avl_first(&sdp->sd_snaps);
+	/*
+	 * Verify that the snapshot is mounted.
+	 */
+	sep = avl_first(&zsb->z_ctldir_snaps);
 	while (sep != NULL) {
-		vp = sep->se_root;
-		zcp = vp->v_data;
-		if (zcp->zc_id == objsetid)
+		error = dmu_snapshot_lookup(zsb->z_os, sep->se_name, &id);
+		if (error)
+			goto out;
+
+		if (id == objsetid)
 			break;
 
-		sep = AVL_NEXT(&sdp->sd_snaps, sep);
+		sep = AVL_NEXT(&zsb->z_ctldir_snaps, sep);
 	}
 
 	if (sep != NULL) {
-		VN_HOLD(vp);
 		/*
-		 * Return the mounted root rather than the covered mount point.
-		 * Takes the GFS vnode at .zfs/snapshot/<snapshot objsetid>
-		 * and returns the ZFS vnode mounted on top of the GFS node.
-		 * This ZFS vnode is the root of the vfs for objset 'objsetid'.
+		 * Lookup the mounted root rather than the covered mount
+		 * point.  This may fail if the snapshot has just been
+		 * unmounted by an unrelated user space process.  This
+		 * race cannot occur to an expired mount point because
+		 * we hold the zsb->z_ctldir_lock to prevent the race.
 		 */
-		error = traverse(&vp);
-		if (error == 0) {
-			if (vp == sep->se_root)
-				error = EINVAL;
-			else
-				*zfsvfsp = VTOZ(vp)->z_zfsvfs;
+		sbp = zpl_sget(&zpl_fs_type, zfsctl_test_super,
+		    zfsctl_set_super, 0, &id);
+		if (IS_ERR(sbp)) {
+			error = -PTR_ERR(sbp);
+		} else {
+			*zsbp = sbp->s_fs_info;
+			deactivate_super(sbp);
 		}
-		mutex_exit(&sdp->sd_lock);
-		VN_RELE(vp);
 	} else {
 		error = EINVAL;
-		mutex_exit(&sdp->sd_lock);
 	}
-
-	VN_RELE(dvp);
+out:
+	mutex_exit(&zsb->z_ctldir_lock);
+	ASSERT3S(error, >=, 0);
 
 	return (error);
 }
 
-/*
- * Unmount any snapshots for the given filesystem.  This is called from
- * zfs_umount() - if we have a ctldir, then go through and unmount all the
- * snapshots.
- */
+/* ARGSUSED */
 int
-zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
+zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp,
+    int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
 {
-	zfsvfs_t *zfsvfs = vfsp->vfs_data;
-	vnode_t *dvp;
-	zfsctl_snapdir_t *sdp;
-	zfs_snapentry_t *sep, *next;
+	zfs_sb_t *zsb = ITOZSB(dip);
+	struct inode *ip;
+	znode_t *dzp;
 	int error;
 
-	ASSERT(zfsvfs->z_ctldir != NULL);
-	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
-	    NULL, 0, NULL, cr, NULL, NULL, NULL);
-	if (error != 0)
-		return (error);
-	sdp = dvp->v_data;
-
-	mutex_enter(&sdp->sd_lock);
+	ZFS_ENTER(zsb);
 
-	sep = avl_first(&sdp->sd_snaps);
-	while (sep != NULL) {
-		next = AVL_NEXT(&sdp->sd_snaps, sep);
+	if (zsb->z_shares_dir == 0) {
+		ZFS_EXIT(zsb);
+		return (ENOTSUP);
+	}
 
-		/*
-		 * If this snapshot is not mounted, then it must
-		 * have just been unmounted by somebody else, and
-		 * will be cleaned up by zfsctl_snapdir_inactive().
-		 */
-		if (vn_ismntpt(sep->se_root)) {
-			avl_remove(&sdp->sd_snaps, sep);
-			error = zfsctl_unmount_snap(sep, fflags, cr);
-			if (error) {
-				avl_add(&sdp->sd_snaps, sep);
-				break;
-			}
-		}
-		sep = next;
+	error = zfs_zget(zsb, zsb->z_shares_dir, &dzp);
+	if (error) {
+		ZFS_EXIT(zsb);
+		return (error);
 	}
 
-	mutex_exit(&sdp->sd_lock);
-	VN_RELE(dvp);
+	error = zfs_lookup(ZTOI(dzp), name, &ip, 0, cr, NULL, NULL);
+
+	iput(ZTOI(dzp));
+	ZFS_EXIT(zsb);
 
 	return (error);
 }
+
+
+/*
+ * Initialize the various pieces we'll need to create and manipulate .zfs
+ * directories.  Currently this only creates the snapshot expiry taskq.
+ */
+void
+zfsctl_init(void)
+{
+	zfs_expire_taskq = taskq_create("z_unmount", 1, maxclsyspri,
+	    1, 8, TASKQ_PREPOPULATE);
+}
+
+/*
+ * Cleanup the various pieces we needed for .zfs directories.  In particular
+ * ensure the expiry timer is canceled safely.
+ */
+void
+zfsctl_fini(void)
+{
+	taskq_destroy(zfs_expire_taskq);
+}
+
+module_param(zfs_expire_snapshot, int, 0644);
+MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot");